diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 184fed2268e81..657e7f1e7f0f7 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -79,6 +79,7 @@ jobs: path: | **/test-results.xml **/*.abilist + **/CMakeConfigureLog.yaml **/CMakeError.log **/CMakeOutput.log **/crash_diagnostics/* @@ -123,6 +124,7 @@ jobs: path: | **/test-results.xml **/*.abilist + **/CMakeConfigureLog.yaml **/CMakeError.log **/CMakeOutput.log **/crash_diagnostics/* @@ -188,6 +190,7 @@ jobs: path: | **/test-results.xml **/*.abilist + **/CMakeConfigureLog.yaml **/CMakeError.log **/CMakeOutput.log **/crash_diagnostics/* @@ -230,6 +233,7 @@ jobs: path: | **/test-results.xml **/*.abilist + **/CMakeConfigureLog.yaml **/CMakeError.log **/CMakeOutput.log **/crash_diagnostics/* diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp index adb26ade955c5..7cc4fe519d3a6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "ReturnConstRefFromParameterCheck.h" +#include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -15,19 +16,24 @@ using namespace clang::ast_matchers; namespace clang::tidy::bugprone { void ReturnConstRefFromParameterCheck::registerMatchers(MatchFinder *Finder) { + const auto DRef = ignoringParens( + declRefExpr( + to(parmVarDecl(hasType(hasCanonicalType( + qualType(lValueReferenceType(pointee( + qualType(isConstQualified())))) + .bind("type")))) + .bind("param"))) + .bind("dref")); + const auto Func = + functionDecl(hasReturnTypeLoc(loc( + 
qualType(hasCanonicalType(equalsBoundNode("type")))))) + .bind("func"); + + Finder->addMatcher(returnStmt(hasReturnValue(DRef), hasAncestor(Func)), this); Finder->addMatcher( - returnStmt( - hasReturnValue(declRefExpr( - to(parmVarDecl(hasType(hasCanonicalType( - qualType(lValueReferenceType(pointee( - qualType(isConstQualified())))) - .bind("type")))) - .bind("param")))), - hasAncestor( - functionDecl(hasReturnTypeLoc(loc(qualType( - hasCanonicalType(equalsBoundNode("type")))))) - .bind("func"))) - .bind("ret"), + returnStmt(hasReturnValue(ignoringParens(conditionalOperator( + eachOf(hasTrueExpression(DRef), hasFalseExpression(DRef)), + hasAncestor(Func))))), this); } @@ -85,8 +91,8 @@ void ReturnConstRefFromParameterCheck::check( const MatchFinder::MatchResult &Result) { const auto *FD = Result.Nodes.getNodeAs("func"); const auto *PD = Result.Nodes.getNodeAs("param"); - const auto *R = Result.Nodes.getNodeAs("ret"); - const SourceRange Range = R->getRetValue()->getSourceRange(); + const auto *DRef = Result.Nodes.getNodeAs("dref"); + const SourceRange Range = DRef->getSourceRange(); if (Range.isInvalid()) return; diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp index 759cdd44fd658..d8cfea534e55f 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp @@ -126,7 +126,7 @@ struct CognitiveComplexity final { // Limit of 25 is the "upstream"'s default. 
static constexpr unsigned DefaultLimit = 25U; - // Based on the publicly-avaliable numbers for some big open-source projects + // Based on the publicly-available numbers for some big open-source projects // https://sonarcloud.io/projects?languages=c%2Ccpp&size=5 we can estimate: // value ~20 would result in no allocs for 98% of functions, ~12 for 96%, ~10 // for 91%, ~8 for 88%, ~6 for 84%, ~4 for 77%, ~2 for 64%, and ~1 for 37%. diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp index 1eeff468ef123..97f67ddf5495a 100644 --- a/clang-tools-extra/clangd/ModulesBuilder.cpp +++ b/clang-tools-extra/clangd/ModulesBuilder.cpp @@ -12,6 +12,7 @@ #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Serialization/ASTReader.h" +#include "clang/Serialization/InMemoryModuleCache.h" namespace clang { namespace clangd { @@ -127,50 +128,68 @@ struct ModuleFile { std::string ModuleFilePath; }; -bool IsModuleFileUpToDate( - PathRef ModuleFilePath, - const PrerequisiteModules &RequisiteModules) { -IntrusiveRefCntPtr Diags = - CompilerInstance::createDiagnostics(new DiagnosticOptions()); - +bool IsModuleFileUpToDate(PathRef ModuleFilePath, + const PrerequisiteModules &RequisiteModules, + llvm::IntrusiveRefCntPtr VFS) { auto HSOpts = std::make_shared(); RequisiteModules.adjustHeaderSearchOptions(*HSOpts); HSOpts->ForceCheckCXX20ModulesInputFiles = true; HSOpts->ValidateASTInputFilesContent = true; + clang::clangd::IgnoreDiagnostics IgnoreDiags; + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(new DiagnosticOptions, &IgnoreDiags, + /*ShouldOwnClient=*/false); + + LangOptions LangOpts; + LangOpts.SkipODRCheckInGMF = true; + + FileManager FileMgr(FileSystemOptions(), VFS); + + SourceManager SourceMgr(*Diags, FileMgr); + + HeaderSearch HeaderInfo(HSOpts, SourceMgr, *Diags, LangOpts, + /*Target=*/nullptr); + + TrivialModuleLoader ModuleLoader; + Preprocessor 
PP(std::make_shared(), *Diags, LangOpts, + SourceMgr, HeaderInfo, ModuleLoader); + + IntrusiveRefCntPtr ModuleCache = new InMemoryModuleCache; PCHContainerOperations PCHOperations; - std::unique_ptr Unit = ASTUnit::LoadFromASTFile( - ModuleFilePath.str(), PCHOperations.getRawReader(), ASTUnit::LoadASTOnly, - Diags, FileSystemOptions(), std::move(HSOpts)); + ASTReader Reader(PP, *ModuleCache, /*ASTContext=*/nullptr, + PCHOperations.getRawReader(), {}); - if (!Unit) - return false; + // We don't need any listener here. By default it will use a validator + // listener. + Reader.setListener(nullptr); - auto Reader = Unit->getASTReader(); - if (!Reader) + if (Reader.ReadAST(ModuleFilePath, serialization::MK_MainFile, + SourceLocation(), + ASTReader::ARR_None) != ASTReader::Success) return false; bool UpToDate = true; - Reader->getModuleManager().visit([&](serialization::ModuleFile &MF) -> bool { - Reader->visitInputFiles( + Reader.getModuleManager().visit([&](serialization::ModuleFile &MF) -> bool { + Reader.visitInputFiles( MF, /*IncludeSystem=*/false, /*Complain=*/false, [&](const serialization::InputFile &IF, bool isSystem) { if (!IF.getFile() || IF.isOutOfDate()) UpToDate = false; }); - return !UpToDate; }); - return UpToDate; } bool IsModuleFilesUpToDate( llvm::SmallVector ModuleFilePaths, - const PrerequisiteModules &RequisiteModules) { - return llvm::all_of(ModuleFilePaths, [&RequisiteModules](auto ModuleFilePath) { - return IsModuleFileUpToDate(ModuleFilePath, RequisiteModules); - }); + const PrerequisiteModules &RequisiteModules, + llvm::IntrusiveRefCntPtr VFS) { + return llvm::all_of( + ModuleFilePaths, [&RequisiteModules, VFS](auto ModuleFilePath) { + return IsModuleFileUpToDate(ModuleFilePath, RequisiteModules, VFS); + }); } // StandalonePrerequisiteModules - stands for PrerequisiteModules for which all @@ -347,7 +366,7 @@ bool StandalonePrerequisiteModules::canReuse( SmallVector BMIPaths; for (auto &MF : RequiredModules) 
BMIPaths.push_back(MF.ModuleFilePath); - return IsModuleFilesUpToDate(BMIPaths, *this); + return IsModuleFilesUpToDate(BMIPaths, *this, VFS); } } // namespace clangd diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index d1d744a21cfd5..91ae9d3003a97 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -335,9 +335,10 @@ class SymbolCollector::HeaderFileURICache { } struct FrameworkHeaderPath { - // Path to the framework directory containing the Headers/PrivateHeaders - // directories e.g. /Frameworks/Foundation.framework/ - llvm::StringRef HeadersParentDir; + // Path to the frameworks directory containing the .framework directory. + llvm::StringRef FrameworkParentDir; + // Name of the framework. + llvm::StringRef FrameworkName; // Subpath relative to the Headers or PrivateHeaders dir, e.g. NSObject.h // Note: This is NOT relative to the `HeadersParentDir`. 
llvm::StringRef HeaderSubpath; @@ -351,19 +352,17 @@ class SymbolCollector::HeaderFileURICache { path::reverse_iterator I = path::rbegin(Path); path::reverse_iterator Prev = I; path::reverse_iterator E = path::rend(Path); + FrameworkHeaderPath HeaderPath; while (I != E) { - if (*I == "Headers") { - FrameworkHeaderPath HeaderPath; - HeaderPath.HeadersParentDir = Path.substr(0, I - E); + if (*I == "Headers" || *I == "PrivateHeaders") { HeaderPath.HeaderSubpath = Path.substr(Prev - E); - HeaderPath.IsPrivateHeader = false; - return HeaderPath; - } - if (*I == "PrivateHeaders") { - FrameworkHeaderPath HeaderPath; - HeaderPath.HeadersParentDir = Path.substr(0, I - E); - HeaderPath.HeaderSubpath = Path.substr(Prev - E); - HeaderPath.IsPrivateHeader = true; + HeaderPath.IsPrivateHeader = *I == "PrivateHeaders"; + if (++I == E) + break; + HeaderPath.FrameworkName = *I; + if (!HeaderPath.FrameworkName.consume_back(".framework")) + break; + HeaderPath.FrameworkParentDir = Path.substr(0, I - E); return HeaderPath; } Prev = I; @@ -379,26 +378,27 @@ class SymbolCollector::HeaderFileURICache { // which should be used instead of directly // importing the header. std::optional - getFrameworkUmbrellaSpelling(llvm::StringRef Framework, - const HeaderSearch &HS, + getFrameworkUmbrellaSpelling(const HeaderSearch &HS, FrameworkHeaderPath &HeaderPath) { + StringRef Framework = HeaderPath.FrameworkName; auto Res = CacheFrameworkToUmbrellaHeaderSpelling.try_emplace(Framework); auto *CachedSpelling = &Res.first->second; if (!Res.second) { return HeaderPath.IsPrivateHeader ? 
CachedSpelling->PrivateHeader : CachedSpelling->PublicHeader; } - SmallString<256> UmbrellaPath(HeaderPath.HeadersParentDir); - llvm::sys::path::append(UmbrellaPath, "Headers", Framework + ".h"); + SmallString<256> UmbrellaPath(HeaderPath.FrameworkParentDir); + llvm::sys::path::append(UmbrellaPath, Framework + ".framework", "Headers", + Framework + ".h"); llvm::vfs::Status Status; auto StatErr = HS.getFileMgr().getNoncachedStatValue(UmbrellaPath, Status); if (!StatErr) CachedSpelling->PublicHeader = llvm::formatv("<{0}/{0}.h>", Framework); - UmbrellaPath = HeaderPath.HeadersParentDir; - llvm::sys::path::append(UmbrellaPath, "PrivateHeaders", - Framework + "_Private.h"); + UmbrellaPath = HeaderPath.FrameworkParentDir; + llvm::sys::path::append(UmbrellaPath, Framework + ".framework", + "PrivateHeaders", Framework + "_Private.h"); StatErr = HS.getFileMgr().getNoncachedStatValue(UmbrellaPath, Status); if (!StatErr) @@ -414,8 +414,7 @@ class SymbolCollector::HeaderFileURICache { // give if the umbrella header exists, otherwise // . 
std::optional - getFrameworkHeaderIncludeSpelling(FileEntryRef FE, llvm::StringRef Framework, - HeaderSearch &HS) { + getFrameworkHeaderIncludeSpelling(FileEntryRef FE, HeaderSearch &HS) { auto Res = CachePathToFrameworkSpelling.try_emplace(FE.getName()); auto *CachedHeaderSpelling = &Res.first->second; if (!Res.second) @@ -429,13 +428,15 @@ class SymbolCollector::HeaderFileURICache { return std::nullopt; } if (auto UmbrellaSpelling = - getFrameworkUmbrellaSpelling(Framework, HS, *HeaderPath)) { + getFrameworkUmbrellaSpelling(HS, *HeaderPath)) { *CachedHeaderSpelling = *UmbrellaSpelling; return llvm::StringRef(*CachedHeaderSpelling); } *CachedHeaderSpelling = - llvm::formatv("<{0}/{1}>", Framework, HeaderPath->HeaderSubpath).str(); + llvm::formatv("<{0}/{1}>", HeaderPath->FrameworkName, + HeaderPath->HeaderSubpath) + .str(); return llvm::StringRef(*CachedHeaderSpelling); } @@ -454,11 +455,8 @@ class SymbolCollector::HeaderFileURICache { // Framework headers are spelled as , not // "path/FrameworkName.framework/Headers/Foo.h". auto &HS = PP->getHeaderSearchInfo(); - if (const auto *HFI = HS.getExistingFileInfo(*FE)) - if (!HFI->Framework.empty()) - if (auto Spelling = - getFrameworkHeaderIncludeSpelling(*FE, HFI->Framework, HS)) - return *Spelling; + if (auto Spelling = getFrameworkHeaderIncludeSpelling(*FE, HS)) + return *Spelling; if (!tooling::isSelfContainedHeader(*FE, PP->getSourceManager(), PP->getHeaderSearchInfo())) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ccebf74e8a67e..2c71e1fcb747b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -163,6 +163,11 @@ Changes in existing checks ` check to support integer literals as LHS and posix call as RHS of comparison. 
+- Improved :doc:`bugprone-return-const-ref-from-parameter + ` check to + diagnose potential dangling references when returning a ``const &`` parameter + by using the conditional operator ``cond ? var1 : var2``. + - Improved :doc:`bugprone-sizeof-expression ` check to find suspicious usages of ``sizeof()``, ``alignof()``, and ``offsetof()`` when adding or diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp index d13c127da7c2a..d83d997a455d5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/return-const-ref-from-parameter.cpp @@ -27,6 +27,10 @@ int const &f3(TConstRef a) { return a; } int const &f4(TConst &a) { return a; } // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: returning a constant reference parameter +int const &f5(TConst &a) { return true ? a : a; } +// CHECK-MESSAGES: :[[@LINE-1]]:42: warning: returning a constant reference parameter +// CHECK-MESSAGES: :[[@LINE-2]]:46: warning: returning a constant reference parameter + template const T& tf1(const T &a) { return a; } // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: returning a constant reference parameter @@ -47,6 +51,11 @@ template const T& itf4(typename ConstRef::type a) { return a; } // CHECK-MESSAGES: :[[@LINE-1]]:54: warning: returning a constant reference parameter +template +const T& itf5(const T &a) { return true ? 
a : a; } +// CHECK-MESSAGES: :[[@LINE-1]]:43: warning: returning a constant reference parameter +// CHECK-MESSAGES: :[[@LINE-2]]:47: warning: returning a constant reference parameter + void instantiate(const int ¶m, const float ¶mf, int &mut_param, float &mut_paramf) { itf1(0); itf1(param); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/function-cognitive-complexity.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/function-cognitive-complexity.cpp index aad74da21553f..2f11e0c3e3171 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/function-cognitive-complexity.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/function-cognitive-complexity.cpp @@ -69,10 +69,10 @@ void unittest_false() { //----------------------------------------------------------------------------// // break does not increase cognitive complexity. -// only break LABEL does, but it is unavaliable in C or C++ +// only break LABEL does, but it is unavailable in C or C++ // continue does not increase cognitive complexity. 
-// only continue LABEL does, but it is unavaliable in C or C++ +// only continue LABEL does, but it is unavailable in C or C++ void unittest_b1_00() { // CHECK-NOTES: :[[@LINE-1]]:6: warning: function 'unittest_b1_00' has cognitive complexity of 33 (threshold 0) [readability-function-cognitive-complexity] diff --git a/clang/Maintainers.rst b/clang/Maintainers.rst index 35c218d8e0e8f..896b463d882d0 100644 --- a/clang/Maintainers.rst +++ b/clang/Maintainers.rst @@ -72,7 +72,7 @@ Sema Experimental new constant interpreter ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | Timm Bäder -| tbaeder\@redhat.com (em), tbaeder (Phabricator), tbaederr (GitHub), tbaeder (Discourse), tbaeder (Discord) +| tbaeder\@redhat.com (email), tbaeder (Phabricator), tbaederr (GitHub), tbaeder (Discourse), tbaeder (Discord) Modules & serialization diff --git a/clang/bindings/python/tests/cindex/test_access_specifiers.py b/clang/bindings/python/tests/cindex/test_access_specifiers.py index c1cc18ebe6e58..ca2bbd3cc8611 100644 --- a/clang/bindings/python/tests/cindex/test_access_specifiers.py +++ b/clang/bindings/python/tests/cindex/test_access_specifiers.py @@ -1,18 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex import AccessSpecifier, Config if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import AccessSpecifier -from clang.cindex import Cursor -from clang.cindex import TranslationUnit - -from .util import get_cursor -from .util import get_tu - import unittest +from .util import get_cursor, get_tu + class TestAccessSpecifiers(unittest.TestCase): def test_access_specifiers(self): diff --git a/clang/bindings/python/tests/cindex/test_cdb.py b/clang/bindings/python/tests/cindex/test_cdb.py index a5cc22796aa2a..342a544c86337 100644 --- a/clang/bindings/python/tests/cindex/test_cdb.py +++ b/clang/bindings/python/tests/cindex/test_cdb.py @@ -1,20 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex 
import CompilationDatabase, CompilationDatabaseError, Config if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import CompilationDatabase -from clang.cindex import CompilationDatabaseError -from clang.cindex import CompileCommands -from clang.cindex import CompileCommand -import os import gc import unittest import sys -from .util import skip_if_no_fspath -from .util import str_to_path - +from pathlib import Path kInputsDir = os.path.join(os.path.dirname(__file__), "INPUTS") @@ -31,7 +25,7 @@ def test_create_fail(self): with open(os.devnull, "wb") as null: os.dup2(null.fileno(), 2) with self.assertRaises(CompilationDatabaseError) as cm: - cdb = CompilationDatabase.fromDirectory(path) + CompilationDatabase.fromDirectory(path) os.dup2(stderr, 2) os.close(stderr) @@ -40,7 +34,7 @@ def test_create_fail(self): def test_create(self): """Check we can load a compilation database""" - cdb = CompilationDatabase.fromDirectory(kInputsDir) + CompilationDatabase.fromDirectory(kInputsDir) def test_lookup_succeed(self): """Check we get some results if the file exists in the db""" @@ -48,13 +42,10 @@ def test_lookup_succeed(self): cmds = cdb.getCompileCommands("/home/john.doe/MyProject/project.cpp") self.assertNotEqual(len(cmds), 0) - @skip_if_no_fspath def test_lookup_succeed_pathlike(self): """Same as test_lookup_succeed, but with PathLikes""" - cdb = CompilationDatabase.fromDirectory(str_to_path(kInputsDir)) - cmds = cdb.getCompileCommands( - str_to_path("/home/john.doe/MyProject/project.cpp") - ) + cdb = CompilationDatabase.fromDirectory(Path(kInputsDir)) + cmds = cdb.getCompileCommands(Path("/home/john.doe/MyProject/project.cpp")) self.assertNotEqual(len(cmds), 0) def test_all_compilecommand(self): @@ -175,7 +166,7 @@ def test_compilationDB_references(self): cmds = cdb.getCompileCommands("/home/john.doe/MyProject/project.cpp") del cdb gc.collect() - workingdir = cmds[0].directory + cmds[0].directory def 
test_compilationCommands_references(self): """Ensure CompilationsCommand keeps a reference to CompilationCommands""" @@ -185,4 +176,4 @@ def test_compilationCommands_references(self): cmd0 = cmds[0] del cmds gc.collect() - workingdir = cmd0.directory + cmd0.directory diff --git a/clang/bindings/python/tests/cindex/test_code_completion.py b/clang/bindings/python/tests/cindex/test_code_completion.py index 1d513dbca2536..c7a86aa82a8eb 100644 --- a/clang/bindings/python/tests/cindex/test_code_completion.py +++ b/clang/bindings/python/tests/cindex/test_code_completion.py @@ -1,14 +1,12 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, TranslationUnit if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import TranslationUnit - import unittest -from .util import skip_if_no_fspath -from .util import str_to_path +from pathlib import Path class TestCodeCompletion(unittest.TestCase): @@ -57,11 +55,10 @@ def test_code_complete(self): ] self.check_completion_results(cr, expected) - @skip_if_no_fspath def test_code_complete_pathlike(self): files = [ ( - str_to_path("fake.c"), + Path("fake.c"), """ /// Aaa. 
int test1; @@ -77,14 +74,14 @@ def test_code_complete_pathlike(self): ] tu = TranslationUnit.from_source( - str_to_path("fake.c"), + Path("fake.c"), ["-std=c99"], unsaved_files=files, options=TranslationUnit.PARSE_INCLUDE_BRIEF_COMMENTS_IN_CODE_COMPLETION, ) cr = tu.codeComplete( - str_to_path("fake.c"), + Path("fake.c"), 9, 1, unsaved_files=files, diff --git a/clang/bindings/python/tests/cindex/test_comment.py b/clang/bindings/python/tests/cindex/test_comment.py index 265c6d3d73de0..1ecbb42c18ffc 100644 --- a/clang/bindings/python/tests/cindex/test_comment.py +++ b/clang/bindings/python/tests/cindex/test_comment.py @@ -1,14 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, TranslationUnit if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import TranslationUnit -from tests.cindex.util import get_cursor - import unittest +from .util import get_cursor + class TestComment(unittest.TestCase): def test_comment(self): diff --git a/clang/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py index 77d8ca415708f..4d989a7421e79 100644 --- a/clang/bindings/python/tests/cindex/test_cursor.py +++ b/clang/bindings/python/tests/cindex/test_cursor.py @@ -1,24 +1,23 @@ import os -from clang.cindex import Config + +from clang.cindex import ( + AvailabilityKind, + BinaryOperator, + Config, + CursorKind, + StorageClass, + TemplateArgumentKind, + TranslationUnit, + TypeKind, +) if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -import ctypes import gc import unittest -from clang.cindex import AvailabilityKind -from clang.cindex import CursorKind -from clang.cindex import TemplateArgumentKind -from clang.cindex import TranslationUnit -from clang.cindex import TypeKind -from clang.cindex import BinaryOperator -from clang.cindex import StorageClass -from .util import get_cursor -from .util import 
get_cursors -from .util import get_tu - +from .util import get_cursor, get_cursors, get_tu kInput = """\ struct s0 { @@ -170,7 +169,7 @@ def test_references(self): self.assertIsInstance(cursor.translation_unit, TranslationUnit) # If the TU was destroyed, this should cause a segfault. - parent = cursor.semantic_parent + cursor.semantic_parent def test_canonical(self): source = "struct X; struct X; struct X { int member; };" @@ -344,7 +343,7 @@ class Bar { ) self.assertEqual(len(copy_assignment_operators_cursors), 10) - self.assertTrue(len(non_copy_assignment_operators_cursors), 9) + self.assertEqual(len(non_copy_assignment_operators_cursors), 7) self.assertTrue( all( diff --git a/clang/bindings/python/tests/cindex/test_cursor_kind.py b/clang/bindings/python/tests/cindex/test_cursor_kind.py index 87199dba06ed2..3b693ff45cfd4 100644 --- a/clang/bindings/python/tests/cindex/test_cursor_kind.py +++ b/clang/bindings/python/tests/cindex/test_cursor_kind.py @@ -1,11 +1,10 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, CursorKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import CursorKind - import unittest diff --git a/clang/bindings/python/tests/cindex/test_diagnostics.py b/clang/bindings/python/tests/cindex/test_diagnostics.py index 041083d12c7f1..2ebd3414d692d 100644 --- a/clang/bindings/python/tests/cindex/test_diagnostics.py +++ b/clang/bindings/python/tests/cindex/test_diagnostics.py @@ -1,14 +1,13 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, Diagnostic if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import * -from .util import get_tu - import unittest +from .util import get_tu # FIXME: We need support for invalid translation units to test better. 
diff --git a/clang/bindings/python/tests/cindex/test_enums.py b/clang/bindings/python/tests/cindex/test_enums.py index 63b2292c5d9bd..9e7f44fcf7867 100644 --- a/clang/bindings/python/tests/cindex/test_enums.py +++ b/clang/bindings/python/tests/cindex/test_enums.py @@ -1,18 +1,18 @@ import unittest from clang.cindex import ( - TokenKind, + AccessSpecifier, + AvailabilityKind, + BinaryOperator, CursorKind, - TemplateArgumentKind, ExceptionSpecificationKind, - AvailabilityKind, - AccessSpecifier, - TypeKind, - RefQualifierKind, LinkageKind, - TLSKind, + RefQualifierKind, StorageClass, - BinaryOperator, + TemplateArgumentKind, + TLSKind, + TokenKind, + TypeKind, ) diff --git a/clang/bindings/python/tests/cindex/test_exception_specification_kind.py b/clang/bindings/python/tests/cindex/test_exception_specification_kind.py index e4742db31adbe..f7806ffad8012 100644 --- a/clang/bindings/python/tests/cindex/test_exception_specification_kind.py +++ b/clang/bindings/python/tests/cindex/test_exception_specification_kind.py @@ -1,18 +1,17 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, CursorKind, ExceptionSpecificationKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -import clang.cindex -from clang.cindex import ExceptionSpecificationKind -from .util import get_tu - import unittest +from .util import get_tu + def find_function_declarations(node, declarations=[]): - if node.kind == clang.cindex.CursorKind.FUNCTION_DECL: + if node.kind == CursorKind.FUNCTION_DECL: declarations.append(node) for child in node.get_children(): declarations = find_function_declarations(child, declarations) diff --git a/clang/bindings/python/tests/cindex/test_file.py b/clang/bindings/python/tests/cindex/test_file.py index 7024b0cdf11d9..14a3084ee2b47 100644 --- a/clang/bindings/python/tests/cindex/test_file.py +++ b/clang/bindings/python/tests/cindex/test_file.py @@ -1,11 +1,10 @@ import os -from clang.cindex import 
Config + +from clang.cindex import Config, File, Index if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import Index, File - import unittest diff --git a/clang/bindings/python/tests/cindex/test_index.py b/clang/bindings/python/tests/cindex/test_index.py index bf29628f5e4e7..756f4bd9c7dfb 100644 --- a/clang/bindings/python/tests/cindex/test_index.py +++ b/clang/bindings/python/tests/cindex/test_index.py @@ -1,20 +1,18 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, Index, TranslationUnit if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import * -import os import unittest - kInputsDir = os.path.join(os.path.dirname(__file__), "INPUTS") class TestIndex(unittest.TestCase): def test_create(self): - index = Index.create() + Index.create() # FIXME: test Index.read diff --git a/clang/bindings/python/tests/cindex/test_linkage.py b/clang/bindings/python/tests/cindex/test_linkage.py index 4a8838276fae5..93bf43a042047 100644 --- a/clang/bindings/python/tests/cindex/test_linkage.py +++ b/clang/bindings/python/tests/cindex/test_linkage.py @@ -1,18 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, LinkageKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import LinkageKind -from clang.cindex import Cursor -from clang.cindex import TranslationUnit - -from .util import get_cursor -from .util import get_tu - import unittest +from .util import get_cursor, get_tu + class TestLinkage(unittest.TestCase): def test_linkage(self): diff --git a/clang/bindings/python/tests/cindex/test_location.py b/clang/bindings/python/tests/cindex/test_location.py index 27854a312e672..bbf79126ab1f8 100644 --- a/clang/bindings/python/tests/cindex/test_location.py +++ b/clang/bindings/python/tests/cindex/test_location.py @@ -1,19 +1,20 
@@ import os -from clang.cindex import Config + +from clang.cindex import ( + Config, + Cursor, + File, + SourceLocation, + SourceRange, + TranslationUnit, +) if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import Cursor -from clang.cindex import File -from clang.cindex import SourceLocation -from clang.cindex import SourceRange -from clang.cindex import TranslationUnit -from .util import get_cursor -from .util import get_tu - import unittest +from .util import get_cursor, get_tu baseInput = "int one;\nint two;\n" diff --git a/clang/bindings/python/tests/cindex/test_rewrite.py b/clang/bindings/python/tests/cindex/test_rewrite.py index 42006f57554e2..6f7e5d906172c 100644 --- a/clang/bindings/python/tests/cindex/test_rewrite.py +++ b/clang/bindings/python/tests/cindex/test_rewrite.py @@ -1,13 +1,7 @@ -import unittest import tempfile +import unittest -from clang.cindex import ( - Rewriter, - TranslationUnit, - File, - SourceLocation, - SourceRange, -) +from clang.cindex import File, Rewriter, SourceLocation, SourceRange, TranslationUnit class TestRewrite(unittest.TestCase): diff --git a/clang/bindings/python/tests/cindex/test_source_range.py b/clang/bindings/python/tests/cindex/test_source_range.py index 47d8960fcafb3..81c0a9b05cff8 100644 --- a/clang/bindings/python/tests/cindex/test_source_range.py +++ b/clang/bindings/python/tests/cindex/test_source_range.py @@ -1,11 +1,11 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, SourceLocation, SourceRange, TranslationUnit if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) import unittest -from clang.cindex import SourceLocation, SourceRange, TranslationUnit from .util import get_tu diff --git a/clang/bindings/python/tests/cindex/test_tls_kind.py b/clang/bindings/python/tests/cindex/test_tls_kind.py index b8ef74614ab03..f80a46f4d5680 100644 --- 
a/clang/bindings/python/tests/cindex/test_tls_kind.py +++ b/clang/bindings/python/tests/cindex/test_tls_kind.py @@ -1,18 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, TLSKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import TLSKind -from clang.cindex import Cursor -from clang.cindex import TranslationUnit - -from .util import get_cursor -from .util import get_tu - import unittest +from .util import get_cursor, get_tu + class TestTLSKind(unittest.TestCase): def test_tls_kind(self): diff --git a/clang/bindings/python/tests/cindex/test_token_kind.py b/clang/bindings/python/tests/cindex/test_token_kind.py index 747d328a577dc..594f30a448d84 100644 --- a/clang/bindings/python/tests/cindex/test_token_kind.py +++ b/clang/bindings/python/tests/cindex/test_token_kind.py @@ -1,11 +1,10 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, TokenKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import TokenKind - import unittest diff --git a/clang/bindings/python/tests/cindex/test_tokens.py b/clang/bindings/python/tests/cindex/test_tokens.py index 2cbf42c4c6cb9..b6c1fc8b83600 100644 --- a/clang/bindings/python/tests/cindex/test_tokens.py +++ b/clang/bindings/python/tests/cindex/test_tokens.py @@ -1,19 +1,14 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, CursorKind, SourceLocation, SourceRange, TokenKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from clang.cindex import CursorKind -from clang.cindex import Index -from clang.cindex import SourceLocation -from clang.cindex import SourceRange -from clang.cindex import TokenKind +import unittest from .util import get_tu -import unittest - class TestTokens(unittest.TestCase): def test_token_to_cursor(self): diff --git 
a/clang/bindings/python/tests/cindex/test_translation_unit.py b/clang/bindings/python/tests/cindex/test_translation_unit.py index ff7213c69dd0f..56bf374241755 100644 --- a/clang/bindings/python/tests/cindex/test_translation_unit.py +++ b/clang/bindings/python/tests/cindex/test_translation_unit.py @@ -1,30 +1,28 @@ import os -from clang.cindex import Config + +from clang.cindex import ( + Config, + Cursor, + CursorKind, + File, + Index, + SourceLocation, + SourceRange, + TranslationUnit, + TranslationUnitLoadError, + TranslationUnitSaveError, +) if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) -from contextlib import contextmanager import gc -import os -import sys import tempfile import unittest +from contextlib import contextmanager +from pathlib import Path -from clang.cindex import CursorKind -from clang.cindex import Cursor -from clang.cindex import File -from clang.cindex import Index -from clang.cindex import SourceLocation -from clang.cindex import SourceRange -from clang.cindex import TranslationUnitSaveError -from clang.cindex import TranslationUnitLoadError -from clang.cindex import TranslationUnit -from .util import get_cursor -from .util import get_tu -from .util import skip_if_no_fspath -from .util import str_to_path - +from .util import get_cursor, get_tu kInputsDir = os.path.join(os.path.dirname(__file__), "INPUTS") @@ -47,7 +45,7 @@ def save_tu_pathlike(tu): Returns the filename it was saved to. 
""" with tempfile.NamedTemporaryFile() as t: - tu.save(str_to_path(t.name)) + tu.save(Path(t.name)) yield t.name @@ -105,24 +103,21 @@ def test_unsaved_files(self): self.assertEqual(spellings[-1], "y") def test_unsaved_files_2(self): - if sys.version_info.major >= 3: - from io import StringIO - else: - from io import BytesIO as StringIO + from io import StringIO + tu = TranslationUnit.from_source( "fake.c", unsaved_files=[("fake.c", StringIO("int x;"))] ) spellings = [c.spelling for c in tu.cursor.get_children()] self.assertEqual(spellings[-1], "x") - @skip_if_no_fspath def test_from_source_accepts_pathlike(self): tu = TranslationUnit.from_source( - str_to_path("fake.c"), + Path("fake.c"), ["-Iincludes"], unsaved_files=[ ( - str_to_path("fake.c"), + Path("fake.c"), """ #include "fake.h" int x; @@ -130,7 +125,7 @@ def test_from_source_accepts_pathlike(self): """, ), ( - str_to_path("includes/fake.h"), + Path("includes/fake.h"), """ #define SOME_DEFINE y """, @@ -192,7 +187,6 @@ def test_save(self): self.assertTrue(os.path.exists(path)) self.assertGreater(os.path.getsize(path), 0) - @skip_if_no_fspath def test_save_pathlike(self): """Ensure TranslationUnit.save() works with PathLike filename.""" @@ -234,14 +228,13 @@ def test_load(self): # Just in case there is an open file descriptor somewhere. 
del tu2 - @skip_if_no_fspath def test_load_pathlike(self): """Ensure TranslationUnits can be constructed from saved files - PathLike variant.""" tu = get_tu("int foo();") self.assertEqual(len(tu.diagnostics), 0) with save_tu(tu) as path: - tu2 = TranslationUnit.from_ast_file(filename=str_to_path(path)) + tu2 = TranslationUnit.from_ast_file(filename=Path(path)) self.assertEqual(len(tu2.diagnostics), 0) foo = get_cursor(tu2, "foo") @@ -268,18 +261,17 @@ def test_get_file(self): with self.assertRaises(Exception): f = tu.get_file("foobar.cpp") - @skip_if_no_fspath def test_get_file_pathlike(self): """Ensure tu.get_file() works appropriately with PathLike filenames.""" tu = get_tu("int foo();") - f = tu.get_file(str_to_path("t.c")) + f = tu.get_file(Path("t.c")) self.assertIsInstance(f, File) self.assertEqual(f.name, "t.c") with self.assertRaises(Exception): - f = tu.get_file(str_to_path("foobar.cpp")) + f = tu.get_file(Path("foobar.cpp")) def test_get_source_location(self): """Ensure tu.get_source_location() works.""" diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py index 928a9794e4213..ce05fdb1a1ebc 100644 --- a/clang/bindings/python/tests/cindex/test_type.py +++ b/clang/bindings/python/tests/cindex/test_type.py @@ -1,5 +1,6 @@ import os -from clang.cindex import Config + +from clang.cindex import Config, CursorKind, RefQualifierKind, TranslationUnit, TypeKind if "CLANG_LIBRARY_PATH" in os.environ: Config.set_library_path(os.environ["CLANG_LIBRARY_PATH"]) @@ -7,14 +8,7 @@ import gc import unittest -from clang.cindex import CursorKind -from clang.cindex import TranslationUnit -from clang.cindex import TypeKind -from clang.cindex import RefQualifierKind -from .util import get_cursor -from .util import get_cursors -from .util import get_tu - +from .util import get_cursor, get_cursors, get_tu kInput = """\ @@ -138,7 +132,7 @@ def test_references(self): self.assertIsInstance(t.translation_unit, TranslationUnit) 
# If the TU was destroyed, this should cause a segfault. - decl = t.get_declaration() + t.get_declaration() def testConstantArray(self): tu = get_tu(constarrayInput) @@ -459,8 +453,8 @@ def test_offset(self): (["-target", "i386-pc-win32"], (8, 16, 0, 32, 64, 96)), (["-target", "msp430-none-none"], (2, 14, 0, 32, 64, 96)), ] - for flags, values in tries: - align, total, f1, bariton, foo, bar = values + for _, values in tries: + _, _, f1, bariton, foo, bar = values tu = get_tu(source) teststruct = get_cursor(tu, "Test") children = list(teststruct.get_children()) diff --git a/clang/bindings/python/tests/cindex/util.py b/clang/bindings/python/tests/cindex/util.py index 8ba3114b35d1e..5e66a9dc82c44 100644 --- a/clang/bindings/python/tests/cindex/util.py +++ b/clang/bindings/python/tests/cindex/util.py @@ -1,18 +1,6 @@ # This file provides common utility functions for the test suite. -import os - -HAS_FSPATH = hasattr(os, "fspath") - -if HAS_FSPATH: - from pathlib import Path as str_to_path -else: - str_to_path = None - -import unittest - -from clang.cindex import Cursor -from clang.cindex import TranslationUnit +from clang.cindex import Cursor, TranslationUnit def get_tu(source, lang="c", all_warnings=False, flags=[]): @@ -81,14 +69,8 @@ def get_cursors(source, spelling): return cursors -skip_if_no_fspath = unittest.skipUnless( - HAS_FSPATH, "Requires file system path protocol / Python 3.6+" -) - __all__ = [ "get_cursor", "get_cursors", "get_tu", - "skip_if_no_fspath", - "str_to_path", ] diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst index dbd9c91ae508e..7afad5b15b2d5 100644 --- a/clang/docs/ClangFormat.rst +++ b/clang/docs/ClangFormat.rst @@ -363,8 +363,3 @@ those as well). These commands use the file paths shown in the diff output so they will only work from the root of the repository. 
- -Current State of Clang Format for LLVM -====================================== - -The following table :doc:`ClangFormattedStatus` shows the current status of clang-formatting for the entire LLVM source tree. diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst index 99352863b4773..e69cdba434c93 100644 --- a/clang/docs/ClangLinkerWrapper.rst +++ b/clang/docs/ClangLinkerWrapper.rst @@ -30,14 +30,11 @@ only for the linker wrapper will be forwarded to the wrapped linker job. USAGE: clang-linker-wrapper [options] -- OPTIONS: - --bitcode-library=--= - Extra bitcode library to link --cuda-path= Set the system CUDA path --device-debug Use debugging --device-linker= or = Arguments to pass to the device linker invocation --dry-run Print program arguments without running - --embed-bitcode Embed linked bitcode in the module --help-hidden Display all available options --help Display available options (--help-hidden for more) --host-triple= Triple to use for the host compilation diff --git a/clang/docs/ClangSYCLLinker.rst b/clang/docs/ClangSYCLLinker.rst new file mode 100644 index 0000000000000..c1a794a2f65f6 --- /dev/null +++ b/clang/docs/ClangSYCLLinker.rst @@ -0,0 +1,82 @@ +======================= +Clang SYCL Linker +======================= + +.. contents:: + :local: + +.. _clang-sycl-linker: + +Introduction +============ + +This tool works as a wrapper around the SYCL device code linking process. +The purpose of this tool is to provide an interface to link SYCL device bitcode +in LLVM IR format, SYCL device bitcode in SPIR-V IR format, and native binary +objects, and then use the SPIR-V LLVM Translator tool on fully linked device +objects to produce the final output. +After the linking stage, the fully linked device code in LLVM IR format may +undergo several SYCL-specific finalization steps before the SPIR-V code +generation step. +The tool will also support the Ahead-Of-Time (AOT) compilation flow. 
AOT +compilation is the process of invoking the back-end at compile time to produce +the final binary, as opposed to just-in-time (JIT) compilation when final code +generation is deferred until application runtime. + +Device code linking for SYCL offloading has several known quirks that +make it difficult to use in a unified offloading setting. Two of the primary +issues are: +1. Several finalization steps are required to be run on the fully linked LLVM +IR bitcode to guarantee conformance to SYCL standards. This step is unique to +the SYCL offloading compilation flow. +2. The SPIR-V LLVM Translator tool is an external tool and hence SPIR-V IR code +generation cannot be done as part of LTO. This limitation can be lifted once +the SPIR-V backend is available as a viable LLVM backend. + +This tool has been proposed to work around these issues. + +Usage +===== + +This tool can be used with the following options. Several of these options will +be passed down to downstream tools like 'llvm-link', 'llvm-spirv', etc. + +.. code-block:: console + + OVERVIEW: A utility that wraps around the SYCL device code linking process. + This enables linking and code generation for SPIR-V JIT targets and AOT + targets. + + USAGE: clang-sycl-linker [options] + + OPTIONS: + --arch Specify the name of the target architecture. + --dry-run Print generated commands without running. + -g Specify that this was a debug compile. + -help-hidden Display all available options + -help Display available options (--help-hidden for more) + --library-path= Set the library path for SYCL device libraries + --device-libs= A comma separated list of device libraries that are linked during the device link + -o Path to file to write output + --save-temps Save intermediate results + --triple Specify the target triple. 
+ --version Display the version number and exit + -v Print verbose information + -spirv-dump-device-code= Directory to dump SPIR-V IR code into + -is-windows-msvc-env Specify if we are compiling under windows environment + -llvm-spirv-options= Pass options to llvm-spirv tool + --llvm-spirv-path= Set the system llvm-spirv path + +Example +======= + +This tool is intended to be invoked when targeting any of the target offloading +toolchains. When the --sycl-link option is passed to the clang driver, the +driver will invoke the linking job of the target offloading toolchain, which in +turn will invoke this tool. This tool can be used to create one or more fully +linked device images that are ready to be wrapped and linked with host code to +generate the final executable. + +.. code-block:: console + + clang-sycl-linker --triple spirv64 --arch native input.bc diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst index 41b8bbb33baf1..193f5217c1a1a 100644 --- a/clang/docs/RealtimeSanitizer.rst +++ b/clang/docs/RealtimeSanitizer.rst @@ -11,10 +11,15 @@ RealtimeSanitizer (a.k.a. RTSan) is a real-time safety testing tool for C and C+ projects. RTSan can be used to detect real-time violations, i.e. calls to methods that are not safe for use in functions with deterministic run time requirements. RTSan considers any function marked with the ``[[clang::nonblocking]]`` attribute -to be a real-time function. If RTSan detects a call to ``malloc``, ``free``, -``pthread_mutex_lock``, or anything else that could have a non-deterministic -execution time in a function marked ``[[clang::nonblocking]]`` -RTSan raises an error. +to be a real-time function. At run-time, if RTSan detects a call to ``malloc``, +``free``, ``pthread_mutex_lock``, or anything else known to have a +non-deterministic execution time in a function marked ``[[clang::nonblocking]]`` +it raises an error. 
+ +RTSan performs its analysis at run-time but shares the ``[[clang::nonblocking]]`` +attribute with the :doc:`FunctionEffectAnalysis` system, which operates at +compile-time to detect potential real-time safety violations. For comprehensive +detection of real-time safety issues, it is recommended to use both systems together. The runtime slowdown introduced by RealtimeSanitizer is negligible. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6085352dfafe6..1372e49dfac03 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -338,6 +338,19 @@ Removed Compiler Flags Attribute Changes in Clang -------------------------- +- The ``swift_attr`` can now be applied to types. To make it possible to use imported APIs + in Swift safely there has to be a way to annotate individual parameters and result types + with relevant attributes that indicate that e.g. a block is called on a particular actor + or it accepts a Sendable or global-actor (i.e. ``@MainActor``) isolated parameter. + + For example: + + .. code-block:: objc + + @interface MyService + -(void) handle: (void (^ __attribute__((swift_attr("@Sendable"))))(id)) handler; + @end + - Clang now disallows more than one ``__attribute__((ownership_returns(class, idx)))`` with different class names attached to one function. @@ -574,6 +587,9 @@ Bug Fixes to C++ Support (#GH95854). - Fixed an assertion failure when evaluating an invalid expression in an array initializer. (#GH112140) - Fixed an assertion failure in range calculations for conditional throw expressions. (#GH111854) +- Clang now correctly ignores previous partial specializations of member templates explicitly specialized for + an implicitly instantiated class template specialization. (#GH51051) +- Fixed an assertion failure caused by invalid enum forward declarations. (#GH112208) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -659,6 +675,8 @@ X86 Support - Supported intrinsics for ``MOVRS AND AVX10.2``. 
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. +- Support ISA of ``AMX-FP8``. +- Support ISA of ``AMX-TRANSPOSE``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ @@ -743,6 +761,8 @@ AST Matchers - Fixed a crash when traverse lambda expr with invalid captures. (#GH106444) +- Fixed ``isInstantiated`` and ``isInTemplateInstantiation`` to also match for variable templates. (#GH110666) + - Ensure ``hasName`` matches template specializations across inline namespaces, making `matchesNodeFullSlow` and `matchesNodeFullFast` consistent. diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 87b03438e6e0b..da4ec712dc44e 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -3584,7 +3584,7 @@ These are examples of cases that we consider safe: RefCountable* uncounted = this; // ok } -Here are some examples of situations that we warn about as they *might* be potentially unsafe. The logic is that either we're able to guarantee that an argument is safe or it's considered if not a bug then bug-prone. +Here are some examples of situations that we warn about as they *might* be potentially unsafe. The logic is that either we're able to guarantee that a local variable is safe or it's considered unsafe. .. 
code-block:: cpp @@ -3603,11 +3603,48 @@ Here are some examples of situations that we warn about as they *might* be poten RefCountable* uncounted = counted.get(); // warn } -We don't warn about these cases - we don't consider them necessarily safe but since they are very common and usually safe we'd introduce a lot of false positives otherwise: -- variable defined in condition part of an ```if``` statement -- variable defined in init statement condition of a ```for``` statement +alpha.webkit.UncheckedLocalVarsChecker +"""""""""""""""""""""""""""""""""""""" +The goal of this rule is to make sure that any unchecked local variable is backed by a CheckedPtr or CheckedRef with lifetime that is strictly larger than the scope of the unchecked local variable. To be on the safe side we require the scope of an unchecked variable to be embedded in the scope of CheckedPtr/CheckRef object that backs it. + +These are examples of cases that we consider safe: + + .. code-block:: cpp -For the time being we also don't warn about uninitialized uncounted local variables. + void foo1() { + CheckedPtr counted; + // The scope of uncounted is EMBEDDED in the scope of counted. + { + RefCountable* uncounted = counted.get(); // ok + } + } + + void foo2(CheckedPtr counted_param) { + RefCountable* uncounted = counted_param.get(); // ok + } + + void FooClass::foo_method() { + RefCountable* uncounted = this; // ok + } + +Here are some examples of situations that we warn about as they *might* be potentially unsafe. The logic is that either we're able to guarantee that a local variable is safe or it's considered unsafe. + + .. code-block:: cpp + + void foo1() { + RefCountable* uncounted = new RefCountable; // warn + } + + RefCountable* global_uncounted; + void foo2() { + RefCountable* uncounted = global_uncounted; // warn + } + + void foo3() { + RefPtr counted; + // The scope of uncounted is not EMBEDDED in the scope of counted. 
+ RefCountable* uncounted = counted.get(); // warn + } Debug Checkers --------------- diff --git a/clang/docs/index.rst b/clang/docs/index.rst index 1096432813fac..3c473f93e5224 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -93,12 +93,12 @@ Using Clang Tools ClangCheck ClangFormat ClangFormatStyleOptions - ClangFormattedStatus ClangLinkerWrapper ClangNVLinkWrapper ClangOffloadBundler ClangOffloadPackager ClangRepl + ClangSYCLLinker Design Documents ================ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 07b4e36f3ef05..4c1455a3e1bbf 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1719,8 +1719,15 @@ class ASTContext : public RefCountedBase { QualType getInjectedClassNameType(CXXRecordDecl *Decl, QualType TST) const; QualType getAttributedType(attr::Kind attrKind, QualType modifiedType, + QualType equivalentType, + const Attr *attr = nullptr) const; + + QualType getAttributedType(const Attr *attr, QualType modifiedType, QualType equivalentType) const; + QualType getAttributedType(NullabilityKind nullability, QualType modifiedType, + QualType equivalentType); + QualType getBTFTagAttributedType(const BTFTypeTagAttr *BTFAttr, QualType Wrapped) const; diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 3057669e3758b..5f3a885832e2e 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -76,6 +76,7 @@ def APValue : PropertyType { let PassByReference = 1; } def APValueKind : EnumPropertyType<"APValue::ValueKind">; def ArraySizeModifier : EnumPropertyType<"ArraySizeModifier">; def AttrKind : EnumPropertyType<"attr::Kind">; +def Attr : PropertyType<"const Attr *">; def AutoTypeKeyword : EnumPropertyType; def Bool : PropertyType<"bool">; def BuiltinTypeKind : EnumPropertyType<"BuiltinType::Kind">; diff --git a/clang/include/clang/AST/Type.h 
b/clang/include/clang/AST/Type.h index ba3161c366f4d..1bcc7ee0b70de 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -69,6 +69,7 @@ class ValueDecl; class TagDecl; class TemplateParameterList; class Type; +class Attr; enum { TypeAlignmentInBits = 4, @@ -6130,21 +6131,29 @@ class AttributedType : public Type, public llvm::FoldingSetNode { private: friend class ASTContext; // ASTContext creates these + const Attr *Attribute; + QualType ModifiedType; QualType EquivalentType; AttributedType(QualType canon, attr::Kind attrKind, QualType modified, QualType equivalent) - : Type(Attributed, canon, equivalent->getDependence()), - ModifiedType(modified), EquivalentType(equivalent) { - AttributedTypeBits.AttrKind = attrKind; - } + : AttributedType(canon, attrKind, nullptr, modified, equivalent) {} + + AttributedType(QualType canon, const Attr *attr, QualType modified, + QualType equivalent); + +private: + AttributedType(QualType canon, attr::Kind attrKind, const Attr *attr, + QualType modified, QualType equivalent); public: Kind getAttrKind() const { return static_cast(AttributedTypeBits.AttrKind); } + const Attr *getAttr() const { return Attribute; } + QualType getModifiedType() const { return ModifiedType; } QualType getEquivalentType() const { return EquivalentType; } @@ -6176,25 +6185,6 @@ class AttributedType : public Type, public llvm::FoldingSetNode { std::optional getImmediateNullability() const; - /// Retrieve the attribute kind corresponding to the given - /// nullability kind. 
- static Kind getNullabilityAttrKind(NullabilityKind kind) { - switch (kind) { - case NullabilityKind::NonNull: - return attr::TypeNonNull; - - case NullabilityKind::Nullable: - return attr::TypeNullable; - - case NullabilityKind::NullableResult: - return attr::TypeNullableResult; - - case NullabilityKind::Unspecified: - return attr::TypeNullUnspecified; - } - llvm_unreachable("Unknown nullability kind."); - } - /// Strip off the top-level nullability annotation on the given /// type, if it's there. /// @@ -6207,14 +6197,16 @@ class AttributedType : public Type, public llvm::FoldingSetNode { static std::optional stripOuterNullability(QualType &T); void Profile(llvm::FoldingSetNodeID &ID) { - Profile(ID, getAttrKind(), ModifiedType, EquivalentType); + Profile(ID, getAttrKind(), ModifiedType, EquivalentType, Attribute); } static void Profile(llvm::FoldingSetNodeID &ID, Kind attrKind, - QualType modified, QualType equivalent) { + QualType modified, QualType equivalent, + const Attr *attr) { ID.AddInteger(attrKind); ID.AddPointer(modified.getAsOpaquePtr()); ID.AddPointer(equivalent.getAsOpaquePtr()); + ID.AddPointer(attr); } static bool classof(const Type *T) { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index d05072607e949..42f62695963a2 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -668,12 +668,16 @@ let Class = AttributedType in { def : Property<"equivalentType", QualType> { let Read = [{ node->getEquivalentType() }]; } - def : Property<"attribute", AttrKind> { + def : Property<"attrKind", AttrKind> { let Read = [{ node->getAttrKind() }]; } + def : Property<"attribute", Attr> { + let Read = [{ node->getAttr() }]; + } def : Creator<[{ - return ctx.getAttributedType(attribute, modifiedType, equivalentType); + return ctx.getAttributedType(attrKind, modifiedType, + equivalentType, attribute); }]>; } diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h 
b/clang/include/clang/ASTMatchers/ASTMatchers.h index 54e484d41fb1c..c77140842d7a6 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -6750,7 +6750,8 @@ AST_POLYMORPHIC_MATCHER(isTemplateInstantiation, /// matches 'A(int) {...};' and 'A(unsigned) {...}'. AST_MATCHER_FUNCTION(internal::Matcher, isInstantiated) { auto IsInstantiation = decl(anyOf(cxxRecordDecl(isTemplateInstantiation()), - functionDecl(isTemplateInstantiation()))); + functionDecl(isTemplateInstantiation()), + varDecl(isTemplateInstantiation()))); return decl(anyOf(IsInstantiation, hasAncestor(IsInstantiation))); } @@ -6769,9 +6770,9 @@ AST_MATCHER_FUNCTION(internal::Matcher, isInstantiated) { /// will NOT match j += 42; as it's shared between the template definition and /// instantiation. AST_MATCHER_FUNCTION(internal::Matcher, isInTemplateInstantiation) { - return stmt( - hasAncestor(decl(anyOf(cxxRecordDecl(isTemplateInstantiation()), - functionDecl(isTemplateInstantiation()))))); + return stmt(hasAncestor(decl(anyOf(cxxRecordDecl(isTemplateInstantiation()), + functionDecl(isTemplateInstantiation()), + varDecl(isTemplateInstantiation()))))); } /// Matches explicit template specializations of function, class, or diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 47c93b48175fc..156fbd1c4442e 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2838,7 +2838,7 @@ def SwiftAsyncName : InheritableAttr { let Documentation = [SwiftAsyncNameDocs]; } -def SwiftAttr : InheritableAttr { +def SwiftAttr : DeclOrTypeAttr { let Spellings = [GNU<"swift_attr">]; let Args = [StringArgument<"Attribute">]; let Documentation = [SwiftAttrDocs]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index fbbfc4acdf391..b497cce37625c 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4507,8 +4507,8 @@ 
def SwiftAttrDocs : Documentation { let Heading = "swift_attr"; let Content = [{ The ``swift_attr`` provides a Swift-specific annotation for the declaration -to which the attribute appertains to. It can be used on any declaration -in Clang. This kind of annotation is ignored by Clang as it doesn't have any +or type to which the attribute appertains to. It can be used on any declaration +or type in Clang. This kind of annotation is ignored by Clang as it doesn't have any semantic meaning in languages supported by Clang. The Swift compiler can interpret these annotations according to its own rules when importing C or Objective-C declarations. diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e887213aa945e..29001e3208515 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -282,7 +282,7 @@ TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", "dot11-insts") //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts") -TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts") +TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nct", "gfx10-insts") TARGET_BUILTIN(__builtin_amdgcn_s_ttracedata_imm, "vIs", "n", "gfx10-insts") //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index e1e613560167a..d95e8455a304b 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", 
"amx-fp16") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -148,13 +153,25 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose") +TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose") +TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") -TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") +TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") // AMX_FP16 FP16 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16") +// AMX FP8 +TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8") 
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8") +TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8") +TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8") + // RAO-INT TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint") TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint") diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 65551bd7761a9..cdfdaa01fb121 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -37,8 +37,7 @@ def warn_drv_invalid_arch_name_with_suggestion : Warning< "ignoring invalid /arch: argument '%0'; for %select{64|32}1-bit expected one of %2">, InGroup; def warn_drv_avr_mcu_not_specified : Warning< - "no target microcontroller specified on command line, cannot " - "link standard libraries, please pass -mmcu=">, + "no target microcontroller specified, please pass -mmcu=">, InGroup; def warn_drv_avr_libc_not_found: Warning< "no avr-libc installation can be found on the system, " diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 942fc557c5b94..39e4851dd3814 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -430,7 +430,7 @@ LANGOPT(ApplePragmaPack, 1, 0, "Apple gcc-compatible #pragma pack handling") LANGOPT(XLPragmaPack, 1, 0, "IBM XL #pragma pack handling") -LANGOPT(RetainCommentsFromSystemHeaders, 1, 0, "retain documentation comments from system headers in the AST") +COMPATIBLE_LANGOPT(RetainCommentsFromSystemHeaders, 1, 0, "retain documentation comments from system headers in the AST") LANGOPT(APINotes, 1, 0, "use external API notes") LANGOPT(APINotesModules, 1, 0, "use module-based external API notes") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 
9d595984b63c4..805b79491e6ea 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4570,9 +4570,6 @@ def ibuiltininc : Flag<["-"], "ibuiltininc">, Group, HelpText<"Enable builtin #include directories even when -nostdinc is used " "before or after -ibuiltininc. " "Using -nobuiltininc after the option disables it">; -def index_header_map : Flag<["-"], "index-header-map">, - Visibility<[ClangOption, CC1Option]>, - HelpText<"Make the next included directory (-I or -F) an indexer header map">; def iapinotes_modules : JoinedOrSeparate<["-"], "iapinotes-modules">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Add directory to the API notes search path referenced by module name">, MetaVarName<"">; @@ -6300,8 +6297,12 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group; def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group; def mamx_int8 : Flag<["-"], "mamx-int8">, Group; def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group; +def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group; +def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group; def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; +def mamx_transpose : Flag<["-"], "mamx-transpose">, Group; +def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group; def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group; def msse : Flag<["-"], "msse">, Group; @@ -6779,7 +6780,10 @@ def fsycl : Flag<["-"], "fsycl">, def fno_sycl : Flag<["-"], "fno-sycl">, Visibility<[ClangOption, CLOption]>, Group, HelpText<"Disables SYCL kernels compilation for device">; - +def sycl_link : Flag<["--"], "sycl-link">, Flags<[HelpHidden]>, + Visibility<[ClangOption, CLOption]>, + Group, HelpText<"Perform link through clang-sycl-linker via the target " + "offloading toolchain.">; // OS-specific options let Flags = [TargetSpecific] in { defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group; 
diff --git a/clang/include/clang/Lex/DirectoryLookup.h b/clang/include/clang/Lex/DirectoryLookup.h index 81680d3b271e0..bb703dfad2b28 100644 --- a/clang/include/clang/Lex/DirectoryLookup.h +++ b/clang/include/clang/Lex/DirectoryLookup.h @@ -58,10 +58,6 @@ class DirectoryLookup { LLVM_PREFERRED_TYPE(LookupType_t) unsigned LookupType : 2; - /// Whether this is a header map used when building a framework. - LLVM_PREFERRED_TYPE(bool) - unsigned IsIndexHeaderMap : 1; - /// Whether we've performed an exhaustive search for module maps /// within the subdirectories of this directory. LLVM_PREFERRED_TYPE(bool) @@ -73,13 +69,12 @@ class DirectoryLookup { bool isFramework) : u(Dir), DirCharacteristic(DT), LookupType(isFramework ? LT_Framework : LT_NormalDir), - IsIndexHeaderMap(false), SearchedAllModuleMaps(false) {} + SearchedAllModuleMaps(false) {} /// This ctor *does not take ownership* of 'Map'. - DirectoryLookup(const HeaderMap *Map, SrcMgr::CharacteristicKind DT, - bool isIndexHeaderMap) + DirectoryLookup(const HeaderMap *Map, SrcMgr::CharacteristicKind DT) : u(Map), DirCharacteristic(DT), LookupType(LT_HeaderMap), - IsIndexHeaderMap(isIndexHeaderMap), SearchedAllModuleMaps(false) {} + SearchedAllModuleMaps(false) {} /// getLookupType - Return the kind of directory lookup that this is: either a /// normal directory, a framework path, or a HeaderMap. @@ -146,11 +141,6 @@ class DirectoryLookup { return getDirCharacteristic() != SrcMgr::C_User; } - /// Whether this header map is building a framework or not. - bool isIndexHeaderMap() const { - return isHeaderMap() && IsIndexHeaderMap; - } - /// LookupFile - Lookup the specified file in this search path, returning it /// if it exists or returning null if not. 
/// diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h index df75c192c700a..a10adae17998b 100644 --- a/clang/include/clang/Lex/HeaderSearch.h +++ b/clang/include/clang/Lex/HeaderSearch.h @@ -108,16 +108,6 @@ struct HeaderFileInfo { LLVM_PREFERRED_TYPE(bool) unsigned Resolved : 1; - /// Whether this is a header inside a framework that is currently - /// being built. - /// - /// When a framework is being built, the headers have not yet been placed - /// into the appropriate framework subdirectories, and therefore are - /// provided via a header map. This bit indicates when this is one of - /// those framework headers. - LLVM_PREFERRED_TYPE(bool) - unsigned IndexHeaderMapHeader : 1; - /// Whether this file has been looked up as a header. LLVM_PREFERRED_TYPE(bool) unsigned IsValid : 1; @@ -132,15 +122,11 @@ struct HeaderFileInfo { /// external storage. LazyIdentifierInfoPtr LazyControllingMacro; - /// If this header came from a framework include, this is the name - /// of the framework. - StringRef Framework; - HeaderFileInfo() : IsLocallyIncluded(false), isImport(false), isPragmaOnce(false), DirInfo(SrcMgr::C_User), External(false), isModuleHeader(false), isTextualModuleHeader(false), isCompilingModuleHeader(false), - Resolved(false), IndexHeaderMapHeader(false), IsValid(false) {} + Resolved(false), IsValid(false) {} /// Retrieve the controlling macro for this header file, if /// any. @@ -154,6 +140,8 @@ struct HeaderFileInfo { void mergeModuleMembership(ModuleMap::ModuleHeaderRole Role); }; +static_assert(sizeof(HeaderFileInfo) <= 16); + /// An external source of header file information, which may supply /// information about header files already included. 
class ExternalHeaderFileInfoSource { diff --git a/clang/include/clang/Lex/HeaderSearchOptions.h b/clang/include/clang/Lex/HeaderSearchOptions.h index 83a95e9ad90a7..c85e3d2728170 100644 --- a/clang/include/clang/Lex/HeaderSearchOptions.h +++ b/clang/include/clang/Lex/HeaderSearchOptions.h @@ -35,9 +35,6 @@ enum IncludeDirGroup { /// Paths for '\#include <>' added by '-I'. Angled, - /// Like Angled, but marks header maps used when building frameworks. - IndexHeaderMap, - /// Like Angled, but marks system directories. System, diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index 3fdb39202610d..d62c9bb65fadb 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -26,6 +26,9 @@ class SemaAMDGPU : public SemaBase { bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, + unsigned NumDataArgs); + /// Create an AMDGPUWavesPerEUAttr attribute. AMDGPUFlatWorkGroupSizeAttr * CreateAMDGPUFlatWorkGroupSizeAttr(const AttributeCommonInfo &CI, Expr *Min, diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h index 1332eb4f4d423..791a7f45b832f 100644 --- a/clang/include/clang/Sema/SemaObjC.h +++ b/clang/include/clang/Sema/SemaObjC.h @@ -351,6 +351,10 @@ class SemaObjC : public SemaBase { ParsedAttributesView ArgAttrs; }; + ParmVarDecl *ActOnMethodParmDeclaration(Scope *S, ObjCArgInfo &ArgInfo, + int ParamIndex, + bool MethodDefinition); + Decl *ActOnMethodDeclaration( Scope *S, SourceLocation BeginLoc, // location of the + or -. @@ -359,7 +363,7 @@ class SemaObjC : public SemaBase { ArrayRef SelectorLocs, Selector Sel, // optional arguments. The number of types/arguments is obtained // from the Sel.getNumArgs(). 
- ObjCArgInfo *ArgInfo, DeclaratorChunk::ParamInfo *CParamInfo, + ParmVarDecl **ArgInfo, DeclaratorChunk::ParamInfo *CParamInfo, unsigned CNumArgs, // c-style args const ParsedAttributesView &AttrList, tok::ObjCKeywordKind MethodImplKind, bool isVariadic, bool MethodDefinition); diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index b6193866fc713..3b14a0b820331 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -44,7 +44,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. -const unsigned VERSION_MAJOR = 32; +const unsigned VERSION_MAJOR = 33; /// AST file minor version number supported by this version of /// Clang. diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 070c1c9a54f48..9c274adc59a20 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -50,6 +50,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/Timer.h" #include "llvm/Support/VersionTuple.h" #include @@ -1341,9 +1342,48 @@ class ASTReader serialization::InputFile getInputFile(ModuleFile &F, unsigned ID, bool Complain = true); + /// The buffer used as the temporary backing storage for resolved paths. + SmallString<0> PathBuf; + + /// A wrapper around StringRef that temporarily borrows the underlying buffer. 
+ class TemporarilyOwnedStringRef { + StringRef String; + llvm::SaveAndRestore> UnderlyingBuffer; + + public: + TemporarilyOwnedStringRef(StringRef S, SmallString<0> &UnderlyingBuffer) + : String(S), UnderlyingBuffer(UnderlyingBuffer, {}) {} + + /// Return the wrapped \c StringRef that must be outlived by \c this. + const StringRef *operator->() const & { return &String; } + const StringRef &operator*() const & { return String; } + + /// Make it harder to get a \c StringRef that outlives \c this. + const StringRef *operator->() && = delete; + const StringRef &operator*() && = delete; + }; + public: - void ResolveImportedPath(ModuleFile &M, std::string &Filename); - static void ResolveImportedPath(std::string &Filename, StringRef Prefix); + /// Get the buffer for resolving paths. + SmallString<0> &getPathBuf() { return PathBuf; } + + /// Resolve \c Path in the context of module file \c M. The return value + /// must go out of scope before the next call to \c ResolveImportedPath. + static TemporarilyOwnedStringRef + ResolveImportedPath(SmallString<0> &Buf, StringRef Path, ModuleFile &ModF); + /// Resolve \c Path in the context of the \c Prefix directory. The return + /// value must go out of scope before the next call to \c ResolveImportedPath. + static TemporarilyOwnedStringRef + ResolveImportedPath(SmallString<0> &Buf, StringRef Path, StringRef Prefix); + + /// Resolve \c Path in the context of module file \c M. + static std::string ResolveImportedPathAndAllocate(SmallString<0> &Buf, + StringRef Path, + ModuleFile &ModF); + /// Resolve \c Path in the context of the \c Prefix directory. + static std::string ResolveImportedPathAndAllocate(SmallString<0> &Buf, + StringRef Path, + StringRef Prefix); /// Returns the first key declaration for the given declaration. 
This /// is one that is formerly-canonical (or still canonical) and whose module diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 0c8ac75fc40f4..d6090ba1a6c69 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -128,6 +128,8 @@ class ASTRecordWriter AddStmt(const_cast(S)); } + void writeAttr(const Attr *A) { AddAttr(A); } + /// Write an BTFTypeTagAttr object. void writeBTFTypeTagAttr(const BTFTypeTagAttr *A) { AddAttr(A); } diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 9a6b35c1b9f77..86d62b58cac0f 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -1764,4 +1764,8 @@ def UncountedLocalVarsChecker : Checker<"UncountedLocalVarsChecker">, HelpText<"Check uncounted local variables.">, Documentation; +def UncheckedLocalVarsChecker : Checker<"UncheckedLocalVarsChecker">, + HelpText<"Check unchecked local variables.">, + Documentation; + } // end alpha.webkit diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 1c3f771f417cc..d248084666d1b 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3552,7 +3552,8 @@ ASTContext::adjustType(QualType Orig, const auto *AT = dyn_cast(Orig); return getAttributedType(AT->getAttrKind(), adjustType(AT->getModifiedType(), Adjust), - adjustType(AT->getEquivalentType(), Adjust)); + adjustType(AT->getEquivalentType(), Adjust), + AT->getAttr()); } case Type::BTFTagAttributed: { @@ -5197,17 +5198,20 @@ QualType ASTContext::getUnresolvedUsingType( QualType ASTContext::getAttributedType(attr::Kind attrKind, QualType modifiedType, - QualType equivalentType) const { + QualType equivalentType, + const Attr *attr) const { llvm::FoldingSetNodeID id; - AttributedType::Profile(id, 
attrKind, modifiedType, equivalentType); + AttributedType::Profile(id, attrKind, modifiedType, equivalentType, attr); void *insertPos = nullptr; AttributedType *type = AttributedTypes.FindNodeOrInsertPos(id, insertPos); if (type) return QualType(type, 0); + assert(!attr || attr->getKind() == attrKind); + QualType canon = getCanonicalType(equivalentType); - type = new (*this, alignof(AttributedType)) - AttributedType(canon, attrKind, modifiedType, equivalentType); + type = new (*this, alignof(AttributedType)) + AttributedType(canon, attrKind, attr, modifiedType, equivalentType); Types.push_back(type); AttributedTypes.InsertNode(type, insertPos); @@ -5215,6 +5219,33 @@ QualType ASTContext::getAttributedType(attr::Kind attrKind, return QualType(type, 0); } +QualType ASTContext::getAttributedType(const Attr *attr, QualType modifiedType, + QualType equivalentType) const { + return getAttributedType(attr->getKind(), modifiedType, equivalentType, attr); +} + +QualType ASTContext::getAttributedType(NullabilityKind nullability, + QualType modifiedType, + QualType equivalentType) { + switch (nullability) { + case NullabilityKind::NonNull: + return getAttributedType(attr::TypeNonNull, modifiedType, equivalentType); + + case NullabilityKind::Nullable: + return getAttributedType(attr::TypeNullable, modifiedType, equivalentType); + + case NullabilityKind::NullableResult: + return getAttributedType(attr::TypeNullableResult, modifiedType, + equivalentType); + + case NullabilityKind::Unspecified: + return getAttributedType(attr::TypeNullUnspecified, modifiedType, + equivalentType); + } + + llvm_unreachable("Unknown nullability kind"); +} + QualType ASTContext::getBTFTagAttributedType(const BTFTypeTagAttr *BTFAttr, QualType Wrapped) const { llvm::FoldingSetNodeID ID; @@ -7537,8 +7568,8 @@ QualType ASTContext::getArrayDecayedType(QualType Ty) const { // int x[_Nullable] -> int * _Nullable if (auto Nullability = Ty->getNullability()) { - Result = const_cast(this)->getAttributedType( - 
AttributedType::getNullabilityAttrKind(*Nullability), Result, Result); + Result = const_cast(this)->getAttributedType(*Nullability, + Result, Result); } return Result; } @@ -13773,7 +13804,8 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, return QualType(); // FIXME: It's inefficient to have to unify the modified types. return Ctx.getAttributedType(Kind, Ctx.getCommonSugaredType(MX, MY), - Ctx.getQualifiedType(Underlying)); + Ctx.getQualifiedType(Underlying), + AX->getAttr()); } case Type::BTFTagAttributed: { const auto *BX = cast(X); diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index 15c3efe421271..4f677b60e60da 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -85,8 +85,7 @@ QualType clang::desugarForDiagnostic(ASTContext &Context, QualType QT, QualType SugarRT = FT->getReturnType(); QualType RT = desugarForDiagnostic(Context, SugarRT, DesugarReturn); if (auto nullability = AttributedType::stripOuterNullability(SugarRT)) { - RT = Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*nullability), RT, RT); + RT = Context.getAttributedType(*nullability, RT, RT); } bool DesugarArgument = false; @@ -97,8 +96,7 @@ QualType clang::desugarForDiagnostic(ASTContext &Context, QualType QT, QualType PT = desugarForDiagnostic(Context, SugarPT, DesugarArgument); if (auto nullability = AttributedType::stripOuterNullability(SugarPT)) { - PT = Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*nullability), PT, PT); + PT = Context.getAttributedType(*nullability, PT, PT); } Args.push_back(PT); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index e7a6509167f0a..6e31df691fa10 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1580,8 +1580,9 @@ ExpectedType ASTNodeImporter::VisitAttributedType(const AttributedType *T) { if (!ToEquivalentTypeOrErr) return ToEquivalentTypeOrErr.takeError(); - 
return Importer.getToContext().getAttributedType(T->getAttrKind(), - *ToModifiedTypeOrErr, *ToEquivalentTypeOrErr); + return Importer.getToContext().getAttributedType( + T->getAttrKind(), *ToModifiedTypeOrErr, *ToEquivalentTypeOrErr, + T->getAttr()); } ExpectedType diff --git a/clang/lib/AST/ByteCode/Boolean.h b/clang/lib/AST/ByteCode/Boolean.h index c568b557574e2..78d75e75c7531 100644 --- a/clang/lib/AST/ByteCode/Boolean.h +++ b/clang/lib/AST/ByteCode/Boolean.h @@ -81,6 +81,16 @@ class Boolean final { Boolean truncate(unsigned TruncBits) const { return *this; } + static Boolean bitcastFromMemory(const std::byte *Buff, unsigned BitWidth) { + // Boolean width is currently always 8 for all supported targets. If this + // changes we need to get the bool width from the target info. + assert(BitWidth == 8); + bool Val = static_cast(*Buff); + return Boolean(Val); + } + + void bitcastToMemory(std::byte *Buff) { std::memcpy(Buff, &V, sizeof(V)); } + void print(llvm::raw_ostream &OS) const { OS << (V ? "true" : "false"); } std::string toDiagnosticString(const ASTContext &Ctx) const { std::string NameStr; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 59e09a44d747b..75f790d17033c 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -470,6 +470,9 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { return this->emitDecayPtr(*FromT, *ToT, CE); } + case CK_LValueToRValueBitCast: + return this->emitBuiltinBitCast(CE); + case CK_IntegralToBoolean: case CK_FixedPointToBoolean: case CK_BooleanToSignedIntegral: @@ -6426,6 +6429,66 @@ bool Compiler::emitDummyPtr(const DeclTy &D, const Expr *E) { return this->emitDecayPtr(PT_Ptr, PT, E); return false; } + return true; +} + +// This function is constexpr if and only if To, From, and the types of +// all subobjects of To and From are types T such that... 
+// (3.1) - is_union_v is false; +// (3.2) - is_pointer_v is false; +// (3.3) - is_member_pointer_v is false; +// (3.4) - is_volatile_v is false; and +// (3.5) - T has no non-static data members of reference type +template +bool Compiler::emitBuiltinBitCast(const CastExpr *E) { + const Expr *SubExpr = E->getSubExpr(); + QualType FromType = SubExpr->getType(); + QualType ToType = E->getType(); + std::optional ToT = classify(ToType); + + assert(!DiscardResult && "Implement DiscardResult mode for bitcasts."); + + if (ToType->isNullPtrType()) { + if (!this->discard(SubExpr)) + return false; + + return this->emitNullPtr(nullptr, E); + } + + if (FromType->isNullPtrType() && ToT) { + if (!this->discard(SubExpr)) + return false; + + return visitZeroInitializer(*ToT, ToType, E); + } + assert(!ToType->isReferenceType()); + + // Get a pointer to the value-to-cast on the stack. + if (!this->visit(SubExpr)) + return false; + + if (!ToT || ToT == PT_Ptr) { + // Conversion to an array or record type. + assert(false && "Implement bitcast to pointers."); + } + assert(ToT); + + const llvm::fltSemantics *TargetSemantics = nullptr; + if (ToT == PT_Float) + TargetSemantics = &Ctx.getFloatSemantics(ToType); + + // Conversion to a primitive type. FromType can be another + // primitive type, or a record/array. 
+ bool ToTypeIsUChar = (ToType->isSpecificBuiltinType(BuiltinType::UChar) || + ToType->isSpecificBuiltinType(BuiltinType::Char_U)); + uint32_t ResultBitWidth = std::max(Ctx.getBitWidth(ToType), 8u); + + if (!this->emitBitCast(*ToT, ToTypeIsUChar || ToType->isStdByteType(), + ResultBitWidth, TargetSemantics, E)) + return false; + + if (DiscardResult) + return this->emitPop(*ToT, E); return true; } diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 5627d5071e810..d1b624daba6b9 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -374,6 +374,7 @@ class Compiler : public ConstStmtVisitor, bool>, unsigned collectBaseOffset(const QualType BaseType, const QualType DerivedType); bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD); + bool emitBuiltinBitCast(const CastExpr *E); bool compileConstructor(const CXXConstructorDecl *Ctor); bool compileDestructor(const CXXDestructorDecl *Dtor); diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index 5a8a2b64d5582..39a54e4902cd2 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -411,8 +411,16 @@ QualType Descriptor::getElemQualType() const { QualType T = getType(); if (T->isPointerOrReferenceType()) return T->getPointeeType(); - if (const auto *AT = T->getAsArrayTypeUnsafe()) + if (const auto *AT = T->getAsArrayTypeUnsafe()) { + // For primitive arrays, we don't save a QualType at all, + // just a PrimType. Try to figure out the QualType here. 
+ if (isPrimitiveArray()) { + while (T->isArrayType()) + T = T->getAsArrayTypeUnsafe()->getElementType(); + return T; + } return AT->getElementType(); + } if (const auto *CT = T->getAs()) return CT->getElementType(); if (const auto *CT = T->getAs()) diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h index 114487821880f..be38e6991dad7 100644 --- a/clang/lib/AST/ByteCode/Floating.h +++ b/clang/lib/AST/ByteCode/Floating.h @@ -135,6 +135,11 @@ class Floating final { return Floating(APFloat(Sem, API)); } + void bitcastToMemory(std::byte *Buff) { + llvm::APInt API = F.bitcastToAPInt(); + llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8); + } + // === Serialization support === size_t bytesToSerialize() const { return sizeof(llvm::fltSemantics *) + diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h index be537d22d5af1..ca3674263aef4 100644 --- a/clang/lib/AST/ByteCode/Integral.h +++ b/clang/lib/AST/ByteCode/Integral.h @@ -70,6 +70,7 @@ template class Integral final { // The primitive representing the integral. using ReprT = typename Repr::Type; ReprT V; + static_assert(std::is_trivially_copyable_v); /// Primitive representing limits. 
static const auto Min = std::numeric_limits::min(); @@ -154,6 +155,18 @@ template class Integral final { return Compare(V, RHS.V); } + void bitcastToMemory(std::byte *Dest) const { + std::memcpy(Dest, &V, sizeof(V)); + } + + static Integral bitcastFromMemory(const std::byte *Src, unsigned BitWidth) { + assert(BitWidth == sizeof(ReprT) * 8); + ReprT V; + + std::memcpy(&V, Src, sizeof(ReprT)); + return Integral(V); + } + std::string toDiagnosticString(const ASTContext &Ctx) const { std::string NameStr; llvm::raw_string_ostream OS(NameStr); diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index f8aeaaca398fe..8ee08dfb5cfe7 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -171,6 +171,16 @@ template class IntegralAP final { return IntegralAP(Copy); } + void bitcastToMemory(std::byte *Dest) const { + llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8); + } + + static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) { + APInt V(BitWidth, static_cast(0), Signed); + llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8); + return IntegralAP(V); + } + ComparisonCategoryResult compare(const IntegralAP &RHS) const { assert(Signed == RHS.isSigned()); assert(bitWidth() == RHS.bitWidth()); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 513d4512b45cf..0e571624ae18d 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1002,6 +1002,13 @@ static bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B) { return runRecordDestructor(S, OpPC, Pointer(const_cast(B)), Desc); } +static bool hasVirtualDestructor(QualType T) { + if (const CXXRecordDecl *RD = T->getAsCXXRecordDecl()) + if (const CXXDestructorDecl *DD = RD->getDestructor()) + return DD->isVirtual(); + return false; +} + bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, bool IsGlobalDelete) { if 
(!CheckDynamicMemoryAllocation(S, OpPC)) @@ -1019,9 +1026,20 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, return true; // Remove base casts. + QualType InitialType = Ptr.getType(); while (Ptr.isBaseClass()) Ptr = Ptr.getBase(); + // For the non-array case, the types must match if the static type + // does not have a virtual destructor. + if (!DeleteIsArrayForm && Ptr.getType() != InitialType && + !hasVirtualDestructor(InitialType)) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_delete_base_nonvirt_dtor) + << InitialType << Ptr.getType(); + return false; + } + if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_delete_subobject) @@ -1556,6 +1574,23 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) { return true; } +bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits, + bool TargetIsUCharOrByte) { + // This is always fine. + if (!HasIndeterminateBits) + return true; + + // Indeterminate bits can only be bitcast to unsigned char or std::byte. 
+ if (TargetIsUCharOrByte) + return true; + + const Expr *E = S.Current->getExpr(OpPC); + QualType ExprType = E->getType(); + S.FFDiag(E, diag::note_constexpr_bit_cast_indet_dest) + << ExprType << S.getLangOpts().CharIsSigned << E->getSourceRange(); + return false; +} + // https://github.com/llvm/llvm-project/issues/102513 #if defined(_WIN32) && !defined(__clang__) && !defined(NDEBUG) #pragma optimize("", off) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index c95b18ef72c96..52d3a19ca9959 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -20,6 +20,7 @@ #include "Floating.h" #include "Function.h" #include "FunctionPointer.h" +#include "InterpBuiltinBitCast.h" #include "InterpFrame.h" #include "InterpStack.h" #include "InterpState.h" @@ -162,6 +163,8 @@ bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, const CallExpr *CE); bool CheckLiteralType(InterpState &S, CodePtr OpPC, const Type *T); bool InvalidShuffleVectorIndex(InterpState &S, CodePtr OpPC, uint32_t Index); +bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits, + bool TargetIsUCharOrByte); template static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { @@ -273,8 +276,14 @@ bool CheckArraySize(InterpState &S, CodePtr OpPC, SizeT *NumElements, *NumElements > MaxElements) { if (!IsNoThrow) { const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_new_too_large) - << NumElements->toDiagnosticString(S.getASTContext()); + + if (NumElements->isSigned() && NumElements->isNegative()) { + S.FFDiag(Loc, diag::note_constexpr_new_negative) + << NumElements->toDiagnosticString(S.getASTContext()); + } else { + S.FFDiag(Loc, diag::note_constexpr_new_too_large) + << NumElements->toDiagnosticString(S.getASTContext()); + } } return false; } @@ -3033,6 +3042,35 @@ bool CheckNewTypeMismatchArray(InterpState &S, CodePtr OpPC, const Expr *E) { return CheckNewTypeMismatch(S, 
OpPC, E, static_cast(Size)); } bool InvalidNewDeleteExpr(InterpState &S, CodePtr OpPC, const Expr *E); + +template ::T> +inline bool BitCast(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte, + uint32_t ResultBitWidth, const llvm::fltSemantics *Sem) { + const Pointer &FromPtr = S.Stk.pop(); + + if (!CheckLoad(S, OpPC, FromPtr)) + return false; + + size_t BuffSize = ResultBitWidth / 8; + llvm::SmallVector Buff(BuffSize); + bool HasIndeterminateBits = false; + + if (!DoBitCast(S, OpPC, FromPtr, Buff.data(), BuffSize, HasIndeterminateBits)) + return false; + + if (!CheckBitCast(S, OpPC, HasIndeterminateBits, TargetIsUCharOrByte)) + return false; + + if constexpr (std::is_same_v) { + assert(Sem); + S.Stk.push(T::bitcastFromMemory(Buff.data(), *Sem)); + } else { + assert(!Sem); + S.Stk.push(T::bitcastFromMemory(Buff.data(), ResultBitWidth)); + } + return true; +} + //===----------------------------------------------------------------------===// // Read opcode arguments //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b00d2a1768b6b..144f2291651cc 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -10,6 +10,7 @@ #include "Compiler.h" #include "EvalEmitter.h" #include "Interp.h" +#include "InterpBuiltinBitCast.h" #include "PrimType.h" #include "clang/AST/OSLog.h" #include "clang/AST/RecordLayout.h" @@ -1253,7 +1254,7 @@ static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - if (!Call->getArg(0)->getType()->isIntegerType() || + if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || !Call->getArg(1)->getType()->isIntegerType()) return false; @@ -1285,7 +1286,9 @@ static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC, const Function *Func, const CallExpr *Call) { 
QualType CallType = Call->getType(); - if (!CallType->isIntegerType()) + if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || + !Call->getArg(1)->getType()->isIntegerType() || + !CallType->isIntegerType()) return false; PrimType ValT = *S.Ctx.classify(Call->getArg(0)); @@ -1310,7 +1313,8 @@ static bool interp__builtin_ia32_lzcnt(InterpState &S, CodePtr OpPC, const Function *Func, const CallExpr *Call) { QualType CallType = Call->getType(); - if (!CallType->isIntegerType()) + if (!CallType->isIntegerType() || + !Call->getArg(0)->getType()->isIntegerType()) return false; APSInt Val = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); @@ -1323,7 +1327,8 @@ static bool interp__builtin_ia32_tzcnt(InterpState &S, CodePtr OpPC, const Function *Func, const CallExpr *Call) { QualType CallType = Call->getType(); - if (!CallType->isIntegerType()) + if (!CallType->isIntegerType() || + !Call->getArg(0)->getType()->isIntegerType()) return false; APSInt Val = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(0))); @@ -1335,7 +1340,7 @@ static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - if (!Call->getArg(0)->getType()->isIntegerType() || + if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || !Call->getArg(1)->getType()->isIntegerType()) return false; @@ -1360,7 +1365,7 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - if (!Call->getArg(0)->getType()->isIntegerType() || + if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || !Call->getArg(1)->getType()->isIntegerType()) return false; diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp new file mode 100644 index 0000000000000..fde2c6d9b11ac --- /dev/null +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp @@ 
-0,0 +1,367 @@ +//===-------------------- InterpBuiltinBitCast.cpp --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "InterpBuiltinBitCast.h" +#include "Boolean.h" +#include "Context.h" +#include "FixedPoint.h" +#include "Floating.h" +#include "Integral.h" +#include "IntegralAP.h" +#include "InterpState.h" +#include "MemberPointer.h" +#include "Pointer.h" +#include "Record.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/RecordLayout.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/BitVector.h" +#include + +using namespace clang; +using namespace clang::interp; + +/// Used to iterate over pointer fields. +using DataFunc = + llvm::function_ref; + +#define BITCAST_TYPE_SWITCH(Expr, B) \ + do { \ + switch (Expr) { \ + TYPE_SWITCH_CASE(PT_Sint8, B) \ + TYPE_SWITCH_CASE(PT_Uint8, B) \ + TYPE_SWITCH_CASE(PT_Sint16, B) \ + TYPE_SWITCH_CASE(PT_Uint16, B) \ + TYPE_SWITCH_CASE(PT_Sint32, B) \ + TYPE_SWITCH_CASE(PT_Uint32, B) \ + TYPE_SWITCH_CASE(PT_Sint64, B) \ + TYPE_SWITCH_CASE(PT_Uint64, B) \ + TYPE_SWITCH_CASE(PT_IntAP, B) \ + TYPE_SWITCH_CASE(PT_IntAPS, B) \ + TYPE_SWITCH_CASE(PT_Bool, B) \ + default: \ + llvm_unreachable("Unhandled bitcast type"); \ + } \ + } while (0) + +/// Float is a special case that sometimes needs the floating point semantics +/// to be available. 
+#define BITCAST_TYPE_SWITCH_WITH_FLOAT(Expr, B) \ + do { \ + switch (Expr) { \ + TYPE_SWITCH_CASE(PT_Sint8, B) \ + TYPE_SWITCH_CASE(PT_Uint8, B) \ + TYPE_SWITCH_CASE(PT_Sint16, B) \ + TYPE_SWITCH_CASE(PT_Uint16, B) \ + TYPE_SWITCH_CASE(PT_Sint32, B) \ + TYPE_SWITCH_CASE(PT_Uint32, B) \ + TYPE_SWITCH_CASE(PT_Sint64, B) \ + TYPE_SWITCH_CASE(PT_Uint64, B) \ + TYPE_SWITCH_CASE(PT_IntAP, B) \ + TYPE_SWITCH_CASE(PT_IntAPS, B) \ + TYPE_SWITCH_CASE(PT_Bool, B) \ + TYPE_SWITCH_CASE(PT_Float, B) \ + default: \ + llvm_unreachable("Unhandled bitcast type"); \ + } \ + } while (0) + +static bool bitof(std::byte B, unsigned BitIndex) { + return (B & (std::byte{1} << BitIndex)) != std::byte{0}; +} + +static void swapBytes(std::byte *M, size_t N) { + for (size_t I = 0; I != (N / 2); ++I) + std::swap(M[I], M[N - 1 - I]); +} + +/// Track what bits have been initialized to known values and which ones +/// have indeterminate value. +/// All offsets are in bits. +struct BitcastBuffer { + llvm::BitVector Data; + + BitcastBuffer() = default; + + size_t size() const { return Data.size(); } + + const std::byte *data() const { + unsigned NBytes = Data.size() / 8; + unsigned BitVectorWordSize = sizeof(uintptr_t); + bool FullWord = (NBytes % BitVectorWordSize == 0); + + // llvm::BitVector uses 64-bit fields internally, so when we have + // fewer bytes than that, we need to compensate for that on + // big endian hosts. + unsigned DataPlus; + if (llvm::sys::IsBigEndianHost) + DataPlus = BitVectorWordSize - (NBytes % BitVectorWordSize); + else + DataPlus = 0; + + return reinterpret_cast(Data.getData().data()) + + (FullWord ? 0 : DataPlus); + } + + bool allInitialized() const { + // FIXME: Implement. 
+ return true; + } + + void pushData(const std::byte *data, size_t BitOffset, size_t BitWidth, + bool BigEndianTarget) { + Data.reserve(BitOffset + BitWidth); + + bool OnlyFullBytes = BitWidth % 8 == 0; + unsigned NBytes = BitWidth / 8; + + size_t BitsHandled = 0; + // Read all full bytes first + for (size_t I = 0; I != NBytes; ++I) { + std::byte B = + BigEndianTarget ? data[NBytes - OnlyFullBytes - I] : data[I]; + for (unsigned X = 0; X != 8; ++X) { + Data.push_back(bitof(B, X)); + ++BitsHandled; + } + } + + if (BitsHandled == BitWidth) + return; + + // Rest of the bits. + assert((BitWidth - BitsHandled) < 8); + std::byte B = BigEndianTarget ? data[0] : data[NBytes]; + for (size_t I = 0, E = (BitWidth - BitsHandled); I != E; ++I) { + Data.push_back(bitof(B, I)); + ++BitsHandled; + } + + assert(BitsHandled == BitWidth); + } +}; + +/// We use this to recursively iterate over all fields and elements of a pointer +/// and extract relevant data for a bitcast. +static bool enumerateData(const Pointer &P, const Context &Ctx, size_t Offset, + DataFunc F) { + const Descriptor *FieldDesc = P.getFieldDesc(); + assert(FieldDesc); + + // Primitives. + if (FieldDesc->isPrimitive()) + return F(P, FieldDesc->getPrimType(), Offset); + + // Primitive arrays. + if (FieldDesc->isPrimitiveArray()) { + bool BigEndianTarget = Ctx.getASTContext().getTargetInfo().isBigEndian(); + QualType ElemType = FieldDesc->getElemQualType(); + size_t ElemSizeInBits = Ctx.getASTContext().getTypeSize(ElemType); + PrimType ElemT = *Ctx.classify(ElemType); + bool Ok = true; + for (unsigned I = 0; I != FieldDesc->getNumElems(); ++I) { + unsigned Index = BigEndianTarget ? (FieldDesc->getNumElems() - 1 - I) : I; + Ok = Ok && F(P.atIndex(Index), ElemT, Offset); + Offset += ElemSizeInBits; + } + return Ok; + } + + // Composite arrays.
+ if (FieldDesc->isCompositeArray()) { + bool BigEndianTarget = Ctx.getASTContext().getTargetInfo().isBigEndian(); + QualType ElemType = FieldDesc->getElemQualType(); + size_t ElemSizeInBits = Ctx.getASTContext().getTypeSize(ElemType); + for (unsigned I = 0; I != FieldDesc->getNumElems(); ++I) { + unsigned Index = BigEndianTarget ? (FieldDesc->getNumElems() - 1 - I) : I; + enumerateData(P.atIndex(Index).narrow(), Ctx, Offset, F); + Offset += ElemSizeInBits; + } + return true; + } + + // Records. + if (FieldDesc->isRecord()) { + bool BigEndianTarget = Ctx.getASTContext().getTargetInfo().isBigEndian(); + const Record *R = FieldDesc->ElemRecord; + const ASTRecordLayout &Layout = + Ctx.getASTContext().getASTRecordLayout(R->getDecl()); + bool Ok = true; + + auto enumerateFields = [&]() -> void { + for (unsigned I = 0, N = R->getNumFields(); I != N; ++I) { + const Record::Field *Fi = + R->getField(BigEndianTarget ? (N - 1 - I) : I); + Pointer Elem = P.atField(Fi->Offset); + size_t BitOffset = + Offset + Layout.getFieldOffset(Fi->Decl->getFieldIndex()); + Ok = Ok && enumerateData(Elem, Ctx, BitOffset, F); + } + }; + auto enumerateBases = [&]() -> void { + for (unsigned I = 0, N = R->getNumBases(); I != N; ++I) { + const Record::Base *B = R->getBase(BigEndianTarget ? (N - 1 - I) : I); + Pointer Elem = P.atField(B->Offset); + CharUnits ByteOffset = + Layout.getBaseClassOffset(cast(B->Decl)); + size_t BitOffset = Offset + Ctx.getASTContext().toBits(ByteOffset); + Ok = Ok && enumerateData(Elem, Ctx, BitOffset, F); + } + }; + + if (BigEndianTarget) { + enumerateFields(); + enumerateBases(); + } else { + enumerateBases(); + enumerateFields(); + } + + return Ok; + } + + llvm_unreachable("Unhandled data type"); +} + +static bool enumeratePointerFields(const Pointer &P, const Context &Ctx, + DataFunc F) { + return enumerateData(P, Ctx, 0, F); +} + +// This function is constexpr if and only if To, From, and the types of +// all subobjects of To and From are types T such that... 
+// (3.1) - is_union_v is false; +// (3.2) - is_pointer_v is false; +// (3.3) - is_member_pointer_v is false; +// (3.4) - is_volatile_v is false; and +// (3.5) - T has no non-static data members of reference type +// +// NOTE: This is a version of checkBitCastConstexprEligibilityType() in +// ExprConstant.cpp. +static bool CheckBitcastType(InterpState &S, CodePtr OpPC, QualType T, + bool IsToType) { + enum { + E_Union = 0, + E_Pointer, + E_MemberPointer, + E_Volatile, + E_Reference, + }; + enum { C_Member, C_Base }; + + auto diag = [&](int Reason) -> bool { + const Expr *E = S.Current->getExpr(OpPC); + S.FFDiag(E, diag::note_constexpr_bit_cast_invalid_type) + << static_cast(IsToType) << (Reason == E_Reference) << Reason + << E->getSourceRange(); + return false; + }; + auto note = [&](int Construct, QualType NoteType, SourceRange NoteRange) { + S.Note(NoteRange.getBegin(), diag::note_constexpr_bit_cast_invalid_subtype) + << NoteType << Construct << T << NoteRange; + return false; + }; + + T = T.getCanonicalType(); + + if (T->isUnionType()) + return diag(E_Union); + if (T->isPointerType()) + return diag(E_Pointer); + if (T->isMemberPointerType()) + return diag(E_MemberPointer); + if (T.isVolatileQualified()) + return diag(E_Volatile); + + if (const RecordDecl *RD = T->getAsRecordDecl()) { + if (const auto *CXXRD = dyn_cast(RD)) { + for (const CXXBaseSpecifier &BS : CXXRD->bases()) { + if (!CheckBitcastType(S, OpPC, BS.getType(), IsToType)) + return note(C_Base, BS.getType(), BS.getBeginLoc()); + } + } + for (const FieldDecl *FD : RD->fields()) { + if (FD->getType()->isReferenceType()) + return diag(E_Reference); + if (!CheckBitcastType(S, OpPC, FD->getType(), IsToType)) + return note(C_Member, FD->getType(), FD->getSourceRange()); + } + } + + if (T->isArrayType() && + !CheckBitcastType(S, OpPC, S.getASTContext().getBaseElementType(T), + IsToType)) + return false; + + return true; +} + +static bool readPointerToBuffer(const Context &Ctx, const Pointer &FromPtr, + 
BitcastBuffer &Buffer, bool ReturnOnUninit) { + const ASTContext &ASTCtx = Ctx.getASTContext(); + bool SwapData = (ASTCtx.getTargetInfo().isLittleEndian() != + llvm::sys::IsLittleEndianHost); + bool BigEndianTarget = ASTCtx.getTargetInfo().isBigEndian(); + + return enumeratePointerFields( + FromPtr, Ctx, + [&](const Pointer &P, PrimType T, size_t BitOffset) -> bool { + if (!P.isInitialized()) { + assert(false && "Implement uninitialized value tracking"); + return ReturnOnUninit; + } + + assert(P.isInitialized()); + // nullptr_t is a PT_Ptr for us, but it's still not std::is_pointer_v. + if (T == PT_Ptr) + assert(false && "Implement casting to pointer types"); + + CharUnits ObjectReprChars = ASTCtx.getTypeSizeInChars(P.getType()); + unsigned BitWidth; + if (const FieldDecl *FD = P.getField(); FD && FD->isBitField()) + BitWidth = FD->getBitWidthValue(ASTCtx); + else + BitWidth = ASTCtx.toBits(ObjectReprChars); + + llvm::SmallVector Buff(ObjectReprChars.getQuantity()); + BITCAST_TYPE_SWITCH_WITH_FLOAT(T, { + T Val = P.deref(); + Val.bitcastToMemory(Buff.data()); + }); + if (SwapData) + swapBytes(Buff.data(), ObjectReprChars.getQuantity()); + + if (BitWidth != (Buff.size() * 8) && BigEndianTarget) { + Buffer.pushData(Buff.data() + (Buff.size() - 1 - (BitWidth / 8)), + BitOffset, BitWidth, BigEndianTarget); + } else { + Buffer.pushData(Buff.data(), BitOffset, BitWidth, BigEndianTarget); + } + return true; + }); +} + +bool clang::interp::DoBitCast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + std::byte *Buff, size_t BuffSize, + bool &HasIndeterminateBits) { + assert(Ptr.isLive()); + assert(Ptr.isBlockPointer()); + assert(Buff); + + BitcastBuffer Buffer; + if (!CheckBitcastType(S, OpPC, Ptr.getType(), /*IsToType=*/false)) + return false; + + bool Success = readPointerToBuffer(S.getContext(), Ptr, Buffer, + /*ReturnOnUninit=*/false); + assert(Buffer.size() == BuffSize * 8); + + HasIndeterminateBits = !Buffer.allInitialized(); + std::memcpy(Buff, Buffer.data(), 
BuffSize); + + return Success; +} diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h new file mode 100644 index 0000000000000..84ba784e95e23 --- /dev/null +++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.h @@ -0,0 +1,26 @@ +//===------------------ InterpBuiltinBitCast.h ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_BUILITN_BIT_CAST_H +#define LLVM_CLANG_AST_INTERP_BUILITN_BIT_CAST_H + +#include + +namespace clang { +namespace interp { +class Pointer; +class InterpState; +class CodePtr; + +bool DoBitCast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + std::byte *Buff, size_t BuffSize, bool &HasIndeterminateBits); + +} // namespace interp +} // namespace clang + +#endif diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 9136e6b51660d..480febd895a24 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -837,3 +837,13 @@ def CheckNewTypeMismatchArray : Opcode { def IsConstantContext: Opcode; def CheckAllocations : Opcode; + +def BitCastTypeClass : TypeClass { + let Types = [Uint8, Sint8, Uint16, Sint16, Uint32, Sint32, Uint64, Sint64, IntAP, IntAPS, Bool, Float]; +} + +def BitCast : Opcode { + let Types = [BitCastTypeClass]; + let Args = [ArgBool, ArgUint32, ArgFltSemantics]; + let HasGroup = 1; +} diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index c9de039c195d9..54484853fcdae 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -200,15 +200,26 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { // Build the path into the object. 
Pointer Ptr = *this; while (Ptr.isField() || Ptr.isArrayElement()) { + if (Ptr.isArrayRoot()) { - Path.push_back(APValue::LValuePathEntry( - {Ptr.getFieldDesc()->asDecl(), /*IsVirtual=*/false})); + // An array root may still be an array element itself. + if (Ptr.isArrayElement()) { + Ptr = Ptr.expand(); + unsigned Index = Ptr.getIndex(); + Path.push_back(APValue::LValuePathEntry::ArrayIndex(Index)); + QualType ElemType = Ptr.getFieldDesc()->getElemQualType(); + Offset += (Index * ASTCtx.getTypeSizeInChars(ElemType)); + Ptr = Ptr.getArray(); + } else { + Path.push_back(APValue::LValuePathEntry( + {Ptr.getFieldDesc()->asDecl(), /*IsVirtual=*/false})); - if (const auto *FD = - dyn_cast_if_present(Ptr.getFieldDesc()->asDecl())) - Offset += getFieldOffset(FD); + if (const auto *FD = + dyn_cast_if_present(Ptr.getFieldDesc()->asDecl())) + Offset += getFieldOffset(FD); - Ptr = Ptr.getBase(); + Ptr = Ptr.getBase(); + } } else if (Ptr.isArrayElement()) { Ptr = Ptr.expand(); unsigned Index; @@ -219,7 +230,6 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { QualType ElemType = Ptr.getFieldDesc()->getElemQualType(); Offset += (Index * ASTCtx.getTypeSizeInChars(ElemType)); - Path.push_back(APValue::LValuePathEntry::ArrayIndex(Index)); Ptr = Ptr.getArray(); } else { diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 0da518ec92afa..04e0b1884e3d6 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -399,10 +399,10 @@ Descriptor *Program::createDescriptor(const DeclTy &D, const Type *Ty, } // Arrays. - if (const auto ArrayType = Ty->getAsArrayTypeUnsafe()) { + if (const auto *ArrayType = Ty->getAsArrayTypeUnsafe()) { QualType ElemTy = ArrayType->getElementType(); // Array of well-known bounds. - if (auto CAT = dyn_cast(ArrayType)) { + if (const auto *CAT = dyn_cast(ArrayType)) { size_t NumElems = CAT->getZExtSize(); if (std::optional T = Ctx.classify(ElemTy)) { // Arrays of primitives. 
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 6195a16c2c68d..038856248c160 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -74,6 +74,7 @@ add_clang_library(clangAST ByteCode/Function.cpp ByteCode/FunctionPointer.cpp ByteCode/InterpBuiltin.cpp + ByteCode/InterpBuiltinBitCast.cpp ByteCode/Floating.cpp ByteCode/EvaluationResult.cpp ByteCode/DynamicAllocator.cpp diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index b3e46508cf596..14bc260d0245f 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -468,6 +468,7 @@ class CXXNameMangler { void mangleLambdaSig(const CXXRecordDecl *Lambda); void mangleModuleNamePrefix(StringRef Name, bool IsPartition = false); void mangleVendorQualifier(StringRef Name); + void mangleVendorType(StringRef Name); private: @@ -2891,6 +2892,10 @@ void CXXNameMangler::mangleVendorQualifier(StringRef name) { Out << 'U' << name.size() << name; } +void CXXNameMangler::mangleVendorType(StringRef name) { + Out << 'u' << name.size() << name; +} + void CXXNameMangler::mangleRefQualifier(RefQualifierKind RefQualifier) { // ::= R # lvalue reference // ::= O # rvalue-reference @@ -3413,8 +3418,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { if (T->getKind() == BuiltinType::SveBFloat16 && \ isCompatibleWith(LangOptions::ClangABI::Ver17)) { \ /* Prior to Clang 18.0 we used this incorrect mangled name */ \ - type_name = "__SVBFloat16_t"; \ - Out << "u" << type_name.size() << type_name; \ + mangleVendorType("__SVBFloat16_t"); \ } else { \ type_name = MangledName; \ Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ @@ -3436,35 +3440,30 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" -#define PPC_VECTOR_TYPE(Name, Id, Size) \ - case BuiltinType::Id: \ - type_name = #Name; \ - Out << 'u' << type_name.size() << type_name; \ +#define PPC_VECTOR_TYPE(Name, Id, Size) \ + case BuiltinType::Id: \ + mangleVendorType(#Name); \ break; #include "clang/Basic/PPCTypes.def" // TODO: Check the mangling scheme for RISC-V V. #define RVV_TYPE(Name, Id, SingletonId) \ case BuiltinType::Id: \ - type_name = Name; \ - Out << 'u' << type_name.size() << type_name; \ + mangleVendorType(Name); \ break; #include "clang/Basic/RISCVVTypes.def" #define WASM_REF_TYPE(InternalName, MangledName, Id, SingletonId, AS) \ case BuiltinType::Id: \ - type_name = MangledName; \ - Out << 'u' << type_name.size() << type_name; \ + mangleVendorType(MangledName); \ break; #include "clang/Basic/WebAssemblyReferenceTypes.def" #define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) \ case BuiltinType::Id: \ - type_name = Name; \ - Out << 'u' << type_name.size() << type_name; \ + mangleVendorType(Name); \ break; #include "clang/Basic/AMDGPUTypes.def" #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) \ case BuiltinType::Id: \ - type_name = #Name; \ - Out << 'u' << type_name.size() << type_name; \ + mangleVendorType(#Name); \ break; #include "clang/Basic/HLSLIntangibleTypes.def" } @@ -4035,8 +4034,9 @@ void CXXNameMangler::mangleAArch64FixedSveVectorType(const VectorType *T) { if (T->getVectorKind() == VectorKind::SveFixedLengthPredicate) VecSizeInBits *= 8; - Out << "9__SVE_VLSI" << 'u' << TypeName.size() << TypeName << "Lj" - << VecSizeInBits << "EE"; + Out << "9__SVE_VLSI"; + mangleVendorType(TypeName); + Out << "Lj" << VecSizeInBits << "EE"; } void CXXNameMangler::mangleAArch64FixedSveVectorType( @@ -4136,8 +4136,9 @@ void CXXNameMangler::mangleRISCVFixedRVVVectorType(const VectorType *T) { } TypeNameOS << "_t"; - Out << "9__RVV_VLSI" << 'u' << TypeNameStr.size() << TypeNameStr << "Lj" - << 
VecSizeInBits << "EE"; + Out << "9__RVV_VLSI"; + mangleVendorType(TypeNameStr); + Out << "Lj" << VecSizeInBits << "EE"; } void CXXNameMangler::mangleRISCVFixedRVVVectorType( @@ -4236,8 +4237,7 @@ void CXXNameMangler::mangleType(const ConstantMatrixType *T) { // Mangle matrix types as a vendor extended type: // umatrix_typeIE - StringRef VendorQualifier = "matrix_type"; - Out << "u" << VendorQualifier.size() << VendorQualifier; + mangleVendorType("matrix_type"); Out << "I"; auto &ASTCtx = getASTContext(); @@ -4255,8 +4255,7 @@ void CXXNameMangler::mangleType(const ConstantMatrixType *T) { void CXXNameMangler::mangleType(const DependentSizedMatrixType *T) { // Mangle matrix types as a vendor extended type: // umatrix_typeIE - StringRef VendorQualifier = "matrix_type"; - Out << "u" << VendorQualifier.size() << VendorQualifier; + mangleVendorType("matrix_type"); Out << "I"; mangleTemplateArgExpr(T->getRowExpr()); @@ -4302,7 +4301,7 @@ void CXXNameMangler::mangleType(const ObjCObjectType *T) { StringRef name = I->getName(); QualOS << name.size() << name; } - Out << 'U' << QualStr.size() << QualStr; + mangleVendorQualifier(QualStr); } mangleType(T->getBaseType()); @@ -4436,8 +4435,6 @@ void CXXNameMangler::mangleType(const UnaryTransformType *T) { // If this is dependent, we need to record that. If not, we simply // mangle it as the underlying type since they are equivalent. 
if (T->isDependentType()) { - Out << "u"; - StringRef BuiltinName; switch (T->getUTTKind()) { #define TRANSFORM_TYPE_TRAIT_DEF(Enum, Trait) \ @@ -4446,7 +4443,7 @@ void CXXNameMangler::mangleType(const UnaryTransformType *T) { break; #include "clang/Basic/TransformTypeTraits.def" } - Out << BuiltinName.size() << BuiltinName; + mangleVendorType(BuiltinName); } Out << "I"; @@ -5311,9 +5308,8 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, // ::= u * E # vendor extension const TypeTraitExpr *TTE = cast(E); NotPrimaryExpr(); - Out << 'u'; llvm::StringRef Spelling = getTraitSpelling(TTE->getTrait()); - Out << Spelling.size() << Spelling; + mangleVendorType(Spelling); for (TypeSourceInfo *TSI : TTE->getArgs()) { mangleType(TSI->getType()); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 113d4a100528f..229721aeae811 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1241,8 +1241,8 @@ struct SimpleTransformVisitor : public TypeVisitor { == T->getEquivalentType().getAsOpaquePtr()) return QualType(T, 0); - return Ctx.getAttributedType(T->getAttrKind(), modifiedType, - equivalentType); + return Ctx.getAttributedType(T->getAttrKind(), modifiedType, equivalentType, + T->getAttr()); } QualType VisitSubstTemplateTypeParmType(const SubstTemplateTypeParmType *T) { @@ -1545,7 +1545,8 @@ struct SubstObjCTypeArgsVisitor // Rebuild the attributed type. 
return Ctx.getAttributedType(newAttrType->getAttrKind(), - newAttrType->getModifiedType(), newEquivType); + newAttrType->getModifiedType(), newEquivType, + newAttrType->getAttr()); } }; @@ -4115,6 +4116,19 @@ bool RecordType::hasConstFields() const { return false; } +AttributedType::AttributedType(QualType canon, const Attr *attr, + QualType modified, QualType equivalent) + : AttributedType(canon, attr->getKind(), attr, modified, equivalent) {} + +AttributedType::AttributedType(QualType canon, attr::Kind attrKind, + const Attr *attr, QualType modified, + QualType equivalent) + : Type(Attributed, canon, equivalent->getDependence()), Attribute(attr), + ModifiedType(modified), EquivalentType(equivalent) { + AttributedTypeBits.AttrKind = attrKind; + assert(!attr || attr->getKind() == attrKind); +} + bool AttributedType::isQualifier() const { // FIXME: Generate this with TableGen. switch (getAttrKind()) { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 008e87e7e5c14..6d8db5cf4ffd2 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1934,6 +1934,14 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, return; } + if (T->getAttrKind() == attr::SwiftAttr) { + if (auto *swiftAttr = dyn_cast_or_null(T->getAttr())) { + OS << " __attribute__((swift_attr(\"" << swiftAttr->getAttribute() + << "\")))"; + } + return; + } + OS << " __attribute__(("; switch (T->getAttrKind()) { #define TYPE_ATTR(NAME) @@ -1994,6 +2002,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::NonAllocating: case attr::Blocking: case attr::Allocating: + case attr::SwiftAttr: llvm_unreachable("This attribute should have been handled already"); case attr::NSReturnsRetained: diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index fad2f52e89ef1..2c68409b846bc 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ 
-171,6 +171,12 @@ class MatchDescendantVisitor return VisitorBase::TraverseCXXTypeidExpr(Node); } + bool TraverseCXXDefaultInitExpr(CXXDefaultInitExpr *Node) { + if (!TraverseStmt(Node->getExpr())) + return false; + return VisitorBase::TraverseCXXDefaultInitExpr(Node); + } + bool TraverseStmt(Stmt *Node, DataRecursionQueue *Queue = nullptr) { if (!Node) return true; @@ -1972,14 +1978,18 @@ class DerefSimplePtrArithFixableGadget : public FixableGadget { }; /// Scan the function and return a list of gadgets found with provided kits. -static std::tuple -findGadgets(const Decl *D, const UnsafeBufferUsageHandler &Handler, - bool EmitSuggestions) { +static void findGadgets(const Stmt *S, ASTContext &Ctx, + const UnsafeBufferUsageHandler &Handler, + bool EmitSuggestions, FixableGadgetList &FixableGadgets, + WarningGadgetList &WarningGadgets, + DeclUseTracker &Tracker) { struct GadgetFinderCallback : MatchFinder::MatchCallback { - FixableGadgetList FixableGadgets; - WarningGadgetList WarningGadgets; - DeclUseTracker Tracker; + GadgetFinderCallback(FixableGadgetList &FixableGadgets, + WarningGadgetList &WarningGadgets, + DeclUseTracker &Tracker) + : FixableGadgets(FixableGadgets), WarningGadgets(WarningGadgets), + Tracker(Tracker) {} void run(const MatchFinder::MatchResult &Result) override { // In debug mode, assert that we've found exactly one gadget. 
@@ -2020,10 +2030,14 @@ findGadgets(const Decl *D, const UnsafeBufferUsageHandler &Handler, assert(numFound >= 1 && "Gadgets not found in match result!"); assert(numFound <= 1 && "Conflicting bind tags in gadgets!"); } + + FixableGadgetList &FixableGadgets; + WarningGadgetList &WarningGadgets; + DeclUseTracker &Tracker; }; MatchFinder M; - GadgetFinderCallback CB; + GadgetFinderCallback CB{FixableGadgets, WarningGadgets, Tracker}; // clang-format off M.addMatcher( @@ -2068,9 +2082,7 @@ findGadgets(const Decl *D, const UnsafeBufferUsageHandler &Handler, // clang-format on } - M.match(*D->getBody(), D->getASTContext()); - return {std::move(CB.FixableGadgets), std::move(CB.WarningGadgets), - std::move(CB.Tracker)}; + M.match(*S, Ctx); } // Compares AST nodes by source locations. @@ -3614,39 +3626,9 @@ class VariableGroupsManagerImpl : public VariableGroupsManager { } }; -void clang::checkUnsafeBufferUsage(const Decl *D, - UnsafeBufferUsageHandler &Handler, - bool EmitSuggestions) { -#ifndef NDEBUG - Handler.clearDebugNotes(); -#endif - - assert(D && D->getBody()); - // We do not want to visit a Lambda expression defined inside a method - // independently. Instead, it should be visited along with the outer method. - // FIXME: do we want to do the same thing for `BlockDecl`s? - if (const auto *fd = dyn_cast(D)) { - if (fd->getParent()->isLambda() && fd->getParent()->isLocalClass()) - return; - } - - // Do not emit fixit suggestions for functions declared in an - // extern "C" block. 
- if (const auto *FD = dyn_cast(D)) { - for (FunctionDecl *FReDecl : FD->redecls()) { - if (FReDecl->isExternC()) { - EmitSuggestions = false; - break; - } - } - } - - WarningGadgetSets UnsafeOps; - FixableGadgetSets FixablesForAllVars; - - auto [FixableGadgets, WarningGadgets, Tracker] = - findGadgets(D, Handler, EmitSuggestions); - +void applyGadgets(const Decl *D, FixableGadgetList FixableGadgets, + WarningGadgetList WarningGadgets, DeclUseTracker Tracker, + UnsafeBufferUsageHandler &Handler, bool EmitSuggestions) { if (!EmitSuggestions) { // Our job is very easy without suggestions. Just warn about // every problematic operation and consider it done. No need to deal @@ -3690,8 +3672,10 @@ void clang::checkUnsafeBufferUsage(const Decl *D, if (WarningGadgets.empty()) return; - UnsafeOps = groupWarningGadgetsByVar(std::move(WarningGadgets)); - FixablesForAllVars = groupFixablesByVar(std::move(FixableGadgets)); + WarningGadgetSets UnsafeOps = + groupWarningGadgetsByVar(std::move(WarningGadgets)); + FixableGadgetSets FixablesForAllVars = + groupFixablesByVar(std::move(FixableGadgets)); std::map FixItsForVariableGroup; @@ -3912,3 +3896,56 @@ void clang::checkUnsafeBufferUsage(const Decl *D, } } } + +void clang::checkUnsafeBufferUsage(const Decl *D, + UnsafeBufferUsageHandler &Handler, + bool EmitSuggestions) { +#ifndef NDEBUG + Handler.clearDebugNotes(); +#endif + + assert(D); + + SmallVector Stmts; + + if (const auto *FD = dyn_cast(D)) { + // We do not want to visit a Lambda expression defined inside a method + // independently. Instead, it should be visited along with the outer method. + // FIXME: do we want to do the same thing for `BlockDecl`s? + if (const auto *MD = dyn_cast(D)) { + if (MD->getParent()->isLambda() && MD->getParent()->isLocalClass()) + return; + } + + for (FunctionDecl *FReDecl : FD->redecls()) { + if (FReDecl->isExternC()) { + // Do not emit fixit suggestions for functions declared in an + // extern "C" block. 
+ EmitSuggestions = false; + break; + } + } + + Stmts.push_back(FD->getBody()); + + if (const auto *ID = dyn_cast(D)) { + for (const CXXCtorInitializer *CI : ID->inits()) { + Stmts.push_back(CI->getInit()); + } + } + } else if (isa(D) || isa(D)) { + Stmts.push_back(D->getBody()); + } + + assert(!Stmts.empty()); + + FixableGadgetList FixableGadgets; + WarningGadgetList WarningGadgets; + DeclUseTracker Tracker; + for (Stmt *S : Stmts) { + findGadgets(S, D->getASTContext(), Handler, EmitSuggestions, FixableGadgets, + WarningGadgets, Tracker); + } + applyGadgets(D, std::move(FixableGadgets), std::move(WarningGadgets), + std::move(Tracker), Handler, EmitSuggestions); +} diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d45bb0f392d45..fc8bd29faa080 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -575,6 +575,12 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, DiagID != diag::fatal_too_many_errors && Diag.FatalsAsError) Result = diag::Severity::Error; + // Rest of the mappings are only applicable for diagnostics associated with a + // SourceLocation, bail out early for others. + if (!Diag.hasSourceManager()) + return Result; + + const auto &SM = Diag.getSourceManager(); // Custom diagnostics always are emitted in system headers. bool ShowInSystemHeader = !GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemHeader; @@ -583,15 +589,14 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc, // because we also want to ignore extensions and warnings in -Werror and // -pedantic-errors modes, which *map* warnings/extensions to errors. 
if (State->SuppressSystemWarnings && !ShowInSystemHeader && Loc.isValid() && - Diag.getSourceManager().isInSystemHeader( - Diag.getSourceManager().getExpansionLoc(Loc))) + SM.isInSystemHeader(SM.getExpansionLoc(Loc))) return diag::Severity::Ignored; // We also ignore warnings due to system macros bool ShowInSystemMacro = !GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemMacro; if (State->SuppressSystemWarnings && !ShowInSystemMacro && Loc.isValid() && - Diag.getSourceManager().isInSystemMacro(Loc)) + SM.isInSystemMacro(Loc)) return diag::Severity::Ignored; return Result; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 3d8de0294d4ba..e35ee2b7b9c38 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1714,7 +1714,7 @@ void DarwinAArch64TargetInfo::getOSDefines(const LangOptions &Opts, if (Triple.isArm64e()) Builder.defineMacro("__arm64e__", "1"); - getDarwinDefines(Builder, Opts, Triple, PlatformName, PlatformMinVersion); + DarwinTargetInfo::getOSDefines(Opts, Triple, Builder); } TargetInfo::BuiltinVaListKind diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 82d29ea9fea5c..d7d3adef42c79 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -428,6 +428,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXTILE = true; } else if (Feature == "+amx-complex") { HasAMXCOMPLEX = true; + } else if (Feature == "+amx-fp8") { + HasAMXFP8 = true; + } else if (Feature == "+amx-transpose") { + HasAMXTRANSPOSE = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -947,6 +951,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP16__"); if (HasAMXCOMPLEX) Builder.defineMacro("__AMX_COMPLEX__"); + if (HasAMXFP8) + Builder.defineMacro("__AMX_FP8__"); + if (HasAMXTRANSPOSE) + 
Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1075,8 +1083,10 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-bf16", true) .Case("amx-complex", true) .Case("amx-fp16", true) + .Case("amx-fp8", true) .Case("amx-int8", true) .Case("amx-tile", true) + .Case("amx-transpose", true) .Case("avx", true) .Case("avx10.1-256", true) .Case("avx10.1-512", true) @@ -1193,8 +1203,10 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-bf16", HasAMXBF16) .Case("amx-complex", HasAMXCOMPLEX) .Case("amx-fp16", HasAMXFP16) + .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) .Case("avx10.1-256", HasAVX10_1) .Case("avx10.1-512", HasAVX10_1_512) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e8aad3ec5a74b..e2eba63b99235 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -157,6 +157,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXINT8 = false; bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; + bool HasAMXFP8 = false; + bool HasAMXTRANSPOSE = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index f018130807519..ae33554a66b6b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1013,9 +1013,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (IsThinLTOPostLink) PB.registerPipelineStartEPCallback( [](ModulePassManager &MPM, OptimizationLevel Level) { - MPM.addPass(LowerTypeTestsPass(/*ExportSummary=*/nullptr, - /*ImportSummary=*/nullptr, - /*DropTypeTests=*/true)); + MPM.addPass(LowerTypeTestsPass( + /*ExportSummary=*/nullptr, + /*ImportSummary=*/nullptr, + 
/*DropTypeTests=*/lowertypetests::DropTestKind::Assume)); + }); // Register callbacks to schedule sanitizer passes at the appropriate part diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index 41bb8d19d161e..bfa9b0a2f836b 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -2590,10 +2590,6 @@ const BlockByrefInfo &CodeGenFunction::getBlockByrefInfo(const VarDecl *D) { if (it != BlockByrefInfos.end()) return it->second; - llvm::StructType *byrefType = - llvm::StructType::create(getLLVMContext(), - "struct.__block_byref_" + D->getNameAsString()); - QualType Ty = D->getType(); CharUnits size; @@ -2658,7 +2654,9 @@ const BlockByrefInfo &CodeGenFunction::getBlockByrefInfo(const VarDecl *D) { } types.push_back(varTy); - byrefType->setBody(types, packed); + llvm::StructType *byrefType = llvm::StructType::create( + getLLVMContext(), types, "struct.__block_byref_" + D->getNameAsString(), + packed); BlockByrefInfo info; info.Type = byrefType; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 65d7f5c54a191..34fedd6711475 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16994,6 +16994,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // instruction, but it will create a memset that won't be optimized away. return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true); } + // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: { + Intrinsic::ID IID; + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + IID = Intrinsic::x86_t2rpntlvwz0_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + IID = Intrinsic::x86_t2rpntlvwz0t1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + IID = Intrinsic::x86_t2rpntlvwz1_internal; + break; + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + IID = Intrinsic::x86_t2rpntlvwz1t1_internal; + break; + } + + // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride) + Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), + {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]}); + + auto *PtrTy = E->getArg(3)->getType()->getAs(); + assert(PtrTy && "arg3 must be of pointer type"); + QualType PtreeTy = PtrTy->getPointeeType(); + llvm::Type *TyPtee = ConvertType(PtreeTy); + + // Bitcast amx type (x86_amx) to vector type (256 x i32) + // Then store tile0 into DstPtr0 + Value *T0 = Builder.CreateExtractValue(Call, 0); + Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T0}); + Builder.CreateDefaultAlignedStore(VecT0, Ops[3]); + + // Then store tile1 into DstPtr1 + Value *T1 = Builder.CreateExtractValue(Call, 1); + Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, + {TyPtee}, {T1}); + Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); + + // Note: We avoid using x86_tilestored64_internal directly to store + the results here, because it cannot guarantee the scope of the memory + it writes. That could cause shape reloads after the first amx intrinsic, + which the current amx register allocation is unable to handle.
+ + return Store; + } case X86::BI__ud2: // llvm.trap makes a ud2a instruction on x86. return EmitTrapCall(Intrinsic::trap); @@ -19115,8 +19167,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return emitBuiltinWithOneOverloadedType<2>(*this, E, Intrinsic::amdgcn_ds_swizzle); case AMDGPU::BI__builtin_amdgcn_mov_dpp8: - return emitBuiltinWithOneOverloadedType<2>(*this, E, - Intrinsic::amdgcn_mov_dpp8); case AMDGPU::BI__builtin_amdgcn_mov_dpp: case AMDGPU::BI__builtin_amdgcn_update_dpp: { llvm::SmallVector Args; @@ -19130,14 +19180,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, unsigned Size = DataTy->getPrimitiveSizeInBits(); llvm::Type *IntTy = llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u)); - Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, IntTy); - assert(E->getNumArgs() == 5 || E->getNumArgs() == 6); - bool InsertOld = E->getNumArgs() == 5; + Function *F = + CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8 + ? Intrinsic::amdgcn_mov_dpp8 + : Intrinsic::amdgcn_update_dpp, + IntTy); + assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 || + E->getNumArgs() == 2); + bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp; if (InsertOld) Args.push_back(llvm::PoisonValue::get(IntTy)); for (unsigned I = 0; I != E->getNumArgs(); ++I) { llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E); - if (I <= (InsertOld ? 0u : 1u) && Size < 32) { + if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 
2u : 1u) && + Size < 32) { if (!DataTy->isIntegerTy()) V = Builder.CreateBitCast( V, llvm::IntegerType::get(Builder.getContext(), Size)); @@ -22394,10 +22450,6 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, default: llvm_unreachable("unexpected builtin ID"); case RISCV::BI__builtin_riscv_orc_b_32: case RISCV::BI__builtin_riscv_orc_b_64: - case RISCV::BI__builtin_riscv_clz_32: - case RISCV::BI__builtin_riscv_clz_64: - case RISCV::BI__builtin_riscv_ctz_32: - case RISCV::BI__builtin_riscv_ctz_64: case RISCV::BI__builtin_riscv_clmul_32: case RISCV::BI__builtin_riscv_clmul_64: case RISCV::BI__builtin_riscv_clmulh_32: @@ -22419,24 +22471,6 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, case RISCV::BI__builtin_riscv_orc_b_64: ID = Intrinsic::riscv_orc_b; break; - case RISCV::BI__builtin_riscv_clz_32: - case RISCV::BI__builtin_riscv_clz_64: { - Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); - Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); - if (Result->getType() != ResultType) - Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true, - "cast"); - return Result; - } - case RISCV::BI__builtin_riscv_ctz_32: - case RISCV::BI__builtin_riscv_ctz_64: { - Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); - Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); - if (Result->getType() != ResultType) - Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true, - "cast"); - return Result; - } // Zbc case RISCV::BI__builtin_riscv_clmul_32: @@ -22511,6 +22545,25 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, ID = Intrinsic::riscv_sm3p1; break; + case RISCV::BI__builtin_riscv_clz_32: + case RISCV::BI__builtin_riscv_clz_64: { + Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType()); + Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); + if (Result->getType() != ResultType) + Result = + 
Builder.CreateIntCast(Result, ResultType, /*isSigned*/ false, "cast"); + return Result; + } + case RISCV::BI__builtin_riscv_ctz_32: + case RISCV::BI__builtin_riscv_ctz_64: { + Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); + Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); + if (Result->getType() != ResultType) + Result = + Builder.CreateIntCast(Result, ResultType, /*isSigned*/ false, "cast"); + return Result; + } + // Zihintntl case RISCV::BI__builtin_riscv_ntl_load: { llvm::Type *ResTy = ConvertType(E->getType()); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 06558ce796f2e..7ba0d61501818 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -306,6 +306,16 @@ void CGHLSLRuntime::annotateHLSLResource(const VarDecl *D, GlobalVariable *GV) { continue; llvm::hlsl::ResourceClass RC = AttrResType->getAttrs().ResourceClass; + if (RC == llvm::hlsl::ResourceClass::UAV || + RC == llvm::hlsl::ResourceClass::SRV) + // UAVs and SRVs have already been converted to use LLVM target types, + // so we can stop generating these resource annotations for them. This + // enables progress on structured buffers with user-defined types, which + // this resource-annotation code does not handle and crashes on. + // This whole function is going to be removed as soon as cbuffers are + // converted to target types (llvm/llvm-project #114126).
+ return; + bool IsROV = AttrResType->getAttrs().IsROV; llvm::hlsl::ResourceKind RK = HLSLResAttr->getResourceKind(); llvm::hlsl::ElementType ET = calculateElementType(CGM.getContext(), Ty); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index 7a07284f8a8aa..d6f5f2a43cf51 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1509,8 +1509,8 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { GetSectionBounds(StringRef Section) { if (CGM.getTriple().isOSBinFormatCOFF()) { if (emptyStruct == nullptr) { - emptyStruct = llvm::StructType::create(VMContext, ".objc_section_sentinel"); - emptyStruct->setBody({}, /*isPacked*/true); + emptyStruct = llvm::StructType::create( + VMContext, {}, ".objc_section_sentinel", /*isPacked=*/true); } auto ZeroInit = llvm::Constant::getNullValue(emptyStruct); auto Sym = [&](StringRef Prefix, StringRef SecSuffix) { diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 1c16d273a5535..47ea636c75643 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -5835,15 +5835,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) // struct _objc_protocol_extension * ProtocolExtensionPtrTy = llvm::PointerType::getUnqual(ProtocolExtensionTy); - // Handle recursive construction of Protocol and ProtocolList types - - ProtocolTy = - llvm::StructType::create(VMContext, "struct._objc_protocol"); - - ProtocolListTy = - llvm::StructType::create(VMContext, "struct._objc_protocol_list"); - ProtocolListTy->setBody(llvm::PointerType::getUnqual(ProtocolListTy), LongTy, - llvm::ArrayType::get(ProtocolTy, 0)); + // Handle construction of Protocol and ProtocolList types // struct _objc_protocol { // struct _objc_protocol_extension *isa; @@ -5852,9 +5844,16 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) // struct _objc_method_description_list *instance_methods; // struct _objc_method_description_list *class_methods; 
// } - ProtocolTy->setBody(ProtocolExtensionPtrTy, Int8PtrTy, - llvm::PointerType::getUnqual(ProtocolListTy), - MethodDescriptionListPtrTy, MethodDescriptionListPtrTy); + ProtocolTy = llvm::StructType::create( + {ProtocolExtensionPtrTy, Int8PtrTy, + llvm::PointerType::getUnqual(VMContext), MethodDescriptionListPtrTy, + MethodDescriptionListPtrTy}, + "struct._objc_protocol"); + + ProtocolListTy = + llvm::StructType::create({llvm::PointerType::getUnqual(VMContext), LongTy, + llvm::ArrayType::get(ProtocolTy, 0)}, + "struct._objc_protocol_list"); // struct _objc_protocol_list * ProtocolListPtrTy = llvm::PointerType::getUnqual(ProtocolListTy); @@ -5886,8 +5885,6 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) "struct._objc_class_extension", IntTy, Int8PtrTy, PropertyListPtrTy); ClassExtensionPtrTy = llvm::PointerType::getUnqual(ClassExtensionTy); - ClassTy = llvm::StructType::create(VMContext, "struct._objc_class"); - // struct _objc_class { // Class isa; // Class super_class; @@ -5902,10 +5899,12 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) // char *ivar_layout; // struct _objc_class_ext *ext; // }; - ClassTy->setBody(llvm::PointerType::getUnqual(ClassTy), - llvm::PointerType::getUnqual(ClassTy), Int8PtrTy, LongTy, - LongTy, LongTy, IvarListPtrTy, MethodListPtrTy, CachePtrTy, - ProtocolListPtrTy, Int8PtrTy, ClassExtensionPtrTy); + ClassTy = llvm::StructType::create( + {llvm::PointerType::getUnqual(VMContext), + llvm::PointerType::getUnqual(VMContext), Int8PtrTy, LongTy, LongTy, + LongTy, IvarListPtrTy, MethodListPtrTy, CachePtrTy, ProtocolListPtrTy, + Int8PtrTy, ClassExtensionPtrTy}, + "struct._objc_class"); ClassPtrTy = llvm::PointerType::getUnqual(ClassTy); @@ -5988,13 +5987,9 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul // const struct _prop_list_t * class_properties; // } - // Holder for struct _protocol_list_t * - ProtocolListnfABITy = - llvm::StructType::create(VMContext, 
"struct._objc_protocol_list"); - ProtocolnfABITy = llvm::StructType::create( "struct._protocol_t", ObjectPtrTy, Int8PtrTy, - llvm::PointerType::getUnqual(ProtocolListnfABITy), MethodListnfABIPtrTy, + llvm::PointerType::getUnqual(VMContext), MethodListnfABIPtrTy, MethodListnfABIPtrTy, MethodListnfABIPtrTy, MethodListnfABIPtrTy, PropertyListPtrTy, IntTy, IntTy, Int8PtrPtrTy, Int8PtrTy, PropertyListPtrTy); @@ -6006,8 +6001,9 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul // long protocol_count; // Note, this is 32/64 bit // struct _protocol_t *[protocol_count]; // } - ProtocolListnfABITy->setBody(LongTy, - llvm::ArrayType::get(ProtocolnfABIPtrTy, 0)); + ProtocolListnfABITy = llvm::StructType::create( + {LongTy, llvm::ArrayType::get(ProtocolnfABIPtrTy, 0)}, + "struct._objc_protocol_list"); // struct _objc_protocol_list* ProtocolListnfABIPtrTy = llvm::PointerType::getUnqual(ProtocolListnfABITy); @@ -6067,11 +6063,12 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(CodeGen::CodeGenModul // struct class_ro_t *ro; // } - ClassnfABITy = llvm::StructType::create(VMContext, "struct._class_t"); - ClassnfABITy->setBody(llvm::PointerType::getUnqual(ClassnfABITy), - llvm::PointerType::getUnqual(ClassnfABITy), CachePtrTy, - llvm::PointerType::getUnqual(ImpnfABITy), - llvm::PointerType::getUnqual(ClassRonfABITy)); + ClassnfABITy = llvm::StructType::create( + {llvm::PointerType::getUnqual(VMContext), + llvm::PointerType::getUnqual(VMContext), CachePtrTy, + llvm::PointerType::getUnqual(ImpnfABITy), + llvm::PointerType::getUnqual(ClassRonfABITy)}, + "struct._class_t"); // LLVM for struct _class_t * ClassnfABIPtrTy = llvm::PointerType::getUnqual(ClassnfABITy); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 0b0b45ffead92..3802dc8bcafc4 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -529,31 +529,29 @@ class MicrosoftCXXABI : public 
CGCXXABI { if (ClassHierarchyDescriptorType) return ClassHierarchyDescriptorType; // Forward-declare RTTIClassHierarchyDescriptor to break a cycle. - ClassHierarchyDescriptorType = llvm::StructType::create( - CGM.getLLVMContext(), "rtti.ClassHierarchyDescriptor"); llvm::Type *FieldTypes[] = {CGM.IntTy, CGM.IntTy, CGM.IntTy, getImageRelativeType(CGM.UnqualPtrTy)}; - ClassHierarchyDescriptorType->setBody(FieldTypes); + ClassHierarchyDescriptorType = + llvm::StructType::create(FieldTypes, "rtti.ClassHierarchyDescriptor"); return ClassHierarchyDescriptorType; } llvm::StructType *getCompleteObjectLocatorType() { if (CompleteObjectLocatorType) return CompleteObjectLocatorType; - CompleteObjectLocatorType = llvm::StructType::create( - CGM.getLLVMContext(), "rtti.CompleteObjectLocator"); llvm::Type *FieldTypes[] = { CGM.IntTy, CGM.IntTy, CGM.IntTy, getImageRelativeType(CGM.Int8PtrTy), getImageRelativeType(CGM.UnqualPtrTy), - getImageRelativeType(CompleteObjectLocatorType), + getImageRelativeType(CGM.VoidTy), }; llvm::ArrayRef FieldTypesRef(FieldTypes); if (!isImageRelative()) FieldTypesRef = FieldTypesRef.drop_back(); - CompleteObjectLocatorType->setBody(FieldTypesRef); + CompleteObjectLocatorType = + llvm::StructType::create(FieldTypesRef, "rtti.CompleteObjectLocator"); return CompleteObjectLocatorType; } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 9878a9dad78d4..083035dee4302 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4791,6 +4791,11 @@ Action *Driver::ConstructPhaseAction( if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm) return Input; + // Use of --sycl-link will only allow for the link phase to occur. This is + // for all input files. + if (Args.hasArg(options::OPT_sycl_link) && Phase != phases::Link) + return Input; + // Build the appropriate action. 
switch (Phase) { case phases::Link: diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 34de0043ca012..bdf3da0c96adc 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -109,7 +109,8 @@ ToolChain::ToolChain(const Driver &D, const llvm::Triple &T, llvm::Expected> ToolChain::executeToolChainProgram(StringRef Executable) const { llvm::SmallString<64> OutputFile; - llvm::sys::fs::createTemporaryFile("toolchain-program", "txt", OutputFile); + llvm::sys::fs::createTemporaryFile("toolchain-program", "txt", OutputFile, + llvm::sys::fs::OF_Text); llvm::FileRemover OutputRemover(OutputFile.c_str()); std::optional Redirects[] = { {""}, @@ -128,7 +129,8 @@ ToolChain::executeToolChainProgram(StringRef Executable) const { *Str + "'"); SecondsToWait = std::max(SecondsToWait, 0); // infinite } - if (llvm::sys::ExecuteAndWait(Executable, {}, {}, Redirects, SecondsToWait, + if (llvm::sys::ExecuteAndWait(Executable, {Executable}, {}, Redirects, + SecondsToWait, /*MemoryLimit=*/0, &ErrorMessage)) return llvm::createStringError(std::error_code(), Executable + ": " + ErrorMessage); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4c6f508f1f24a..dca8d3fd7b3ea 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -649,6 +649,8 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, CmdArgs.push_back(Args.MakeArgString( Twine("--instrument-cold-function-only-path=") + Path)); CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("--pgo-instrument-cold-function-only"); + CmdArgs.push_back("-mllvm"); CmdArgs.push_back("--pgo-function-entry-coverage"); } @@ -1205,8 +1207,7 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, Args.addAllArgs(CmdArgs, {options::OPT_D, options::OPT_U, options::OPT_I_Group, - options::OPT_F, options::OPT_index_header_map, - options::OPT_embed_dir_EQ}); + options::OPT_F, 
options::OPT_embed_dir_EQ}); // Add -Wp, and -Xpreprocessor if using the preprocessor. diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 9daafbe703f68..877cb4e7b1261 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -241,12 +241,16 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, // handled somewhere else. Args.ClaimAllArgs(options::OPT_w); + CmdArgs.push_back("-m"); + CmdArgs.push_back("elf_x86_64_fbsd"); + CmdArgs.push_back( Args.MakeArgString("--sysroot=" + TC.getSDKLibraryRootDir())); // Default to PIE for non-static executables. - const bool PIE = !Relocatable && !Shared && !Static; - if (Args.hasFlag(options::OPT_pie, options::OPT_no_pie, PIE)) + const bool PIE = Args.hasFlag(options::OPT_pie, options::OPT_no_pie, + !Relocatable && !Shared && !Static); + if (PIE) CmdArgs.push_back("-pie"); if (!Relocatable) { @@ -273,6 +277,12 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-z"); CmdArgs.push_back("start-stop-visibility=hidden"); + CmdArgs.push_back("-z"); + CmdArgs.push_back("common-page-size=0x4000"); + + CmdArgs.push_back("-z"); + CmdArgs.push_back("max-page-size=0x4000"); + // Patch relocated regions of DWARF whose targets are eliminated at link // time with specific tombstones, such that they're recognisable by the // PlayStation debugger. @@ -292,6 +302,11 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Shared) CmdArgs.push_back("--shared"); + // Provide a base address for non-PIE executables. This includes cases where + // -static is supplied without -pie. 
+ if (!Relocatable && !Shared && !PIE) + CmdArgs.push_back("--image-base=0x400000"); + assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); if (Output.isFilename()) { CmdArgs.push_back("-o"); diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp index ce900600cbee5..659da5c7f25aa 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.cpp +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp @@ -95,7 +95,21 @@ void SPIRV::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); + // Use of --sycl-link will call the clang-sycl-linker instead of + // the default linker (spirv-link). + if (Args.hasArg(options::OPT_sycl_link)) + Linker = ToolChain.GetProgramPath("clang-sycl-linker"); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Args.MakeArgString(Linker), CmdArgs, Inputs, Output)); } + +SPIRVToolChain::SPIRVToolChain(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : ToolChain(D, Triple, Args) { + // TODO: Revisit need/use of --sycl-link option once SYCL toolchain is + // available and SYCL linking support is moved there. 
+ NativeLLVMSupport = Args.hasArg(options::OPT_sycl_link); +} + +bool SPIRVToolChain::HasNativeLLVMSupport() const { return NativeLLVMSupport; } diff --git a/clang/lib/Driver/ToolChains/SPIRV.h b/clang/lib/Driver/ToolChains/SPIRV.h index d4247ee0557f4..d59a8c76ed473 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.h +++ b/clang/lib/Driver/ToolChains/SPIRV.h @@ -57,8 +57,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain { public: SPIRVToolChain(const Driver &D, const llvm::Triple &Triple, - const llvm::opt::ArgList &Args) - : ToolChain(D, Triple, Args) {} + const llvm::opt::ArgList &Args); bool useIntegratedAs() const override { return true; } @@ -72,6 +71,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain { } bool isPICDefaultForced() const override { return false; } bool SupportsProfiling() const override { return false; } + bool HasNativeLLVMSupport() const override; clang::driver::Tool *SelectTool(const JobAction &JA) const override; @@ -81,6 +81,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRVToolChain final : public ToolChain { private: clang::driver::Tool *getTranslator() const; + bool NativeLLVMSupport; }; } // namespace toolchains diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d8261e12b08b5..b5fd35aaa1e84 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3190,15 +3190,10 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, auto It = Opts.UserEntries.begin(); auto End = Opts.UserEntries.end(); - // Add -I..., -F..., and -index-header-map options in order. - for (; It < End && Matches(*It, {frontend::IndexHeaderMap, frontend::Angled}, - std::nullopt, true); + // Add -I... and -F... options in order. 
+ for (; It < End && Matches(*It, {frontend::Angled}, std::nullopt, true); ++It) { OptSpecifier Opt = [It, Matches]() { - if (Matches(*It, frontend::IndexHeaderMap, true, true)) - return OPT_F; - if (Matches(*It, frontend::IndexHeaderMap, false, true)) - return OPT_I; if (Matches(*It, frontend::Angled, true, true)) return OPT_F; if (Matches(*It, frontend::Angled, false, true)) @@ -3206,8 +3201,6 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, llvm_unreachable("Unexpected HeaderSearchOptions::Entry."); }(); - if (It->Group == frontend::IndexHeaderMap) - GenerateArg(Consumer, OPT_index_header_map); GenerateArg(Consumer, Opt, It->Path); }; @@ -3319,8 +3312,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, llvm::CachedHashString(MacroDef.split('=').first)); } - // Add -I..., -F..., and -index-header-map options in order. - bool IsIndexHeaderMap = false; + // Add -I... and -F... options in order. bool IsSysrootSpecified = Args.hasArg(OPT__sysroot_EQ) || Args.hasArg(OPT_isysroot); @@ -3339,20 +3331,10 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, return A->getValue(); }; - for (const auto *A : Args.filtered(OPT_I, OPT_F, OPT_index_header_map)) { - if (A->getOption().matches(OPT_index_header_map)) { - // -index-header-map applies to the next -I or -F. - IsIndexHeaderMap = true; - continue; - } - - frontend::IncludeDirGroup Group = - IsIndexHeaderMap ? frontend::IndexHeaderMap : frontend::Angled; - + for (const auto *A : Args.filtered(OPT_I, OPT_F)) { bool IsFramework = A->getOption().matches(OPT_F); - Opts.AddPath(PrefixHeaderPath(A, IsFramework), Group, IsFramework, - /*IgnoreSysroot*/ true); - IsIndexHeaderMap = false; + Opts.AddPath(PrefixHeaderPath(A, IsFramework), frontend::Angled, + IsFramework, /*IgnoreSysroot=*/true); } // Add -iprefix/-iwithprefix/-iwithprefixbefore options. 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 0211d1870b30a..67242cd4d981b 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -148,7 +148,9 @@ set(x86_files ammintrin.h amxcomplexintrin.h amxfp16intrin.h + amxfp8intrin.h amxintrin.h + amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h new file mode 100644 index 0000000000000..0f5ddc87e5a75 --- /dev/null +++ b/clang/lib/Headers/amxfp8intrin.h @@ -0,0 +1,95 @@ +/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXFP8INTRIN_H +#define __AMXFP8INTRIN_H +#ifdef __x86_64__ + +/// Perform the dot product of a BF8 value \a a by a BF8 value \a b accumulating +/// into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b)) + +/// Perform the dot product of a BF8 value \a a by an HF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst.
+/// +/// \headerfile +/// +/// \code +/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b)) + +/// Perform the dot product of an HF8 value \a a by a BF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b)) + +/// Perform the dot product of an HF8 value \a a by an HF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_dphf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dphf8ps(dst, a, b) __builtin_ia32_tdphf8ps((dst), (a), (b)) + +#endif /* __x86_64__ */ +#endif /* __AMXFP8INTRIN_H */ diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index baa56f5b28e8e..f07a568901185 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. 
Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +typedef int _tile1024i_1024a + __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h new file mode 100644 index 0000000000000..b3fa37d766c45 --- /dev/null +++ b/clang/lib/Headers/amxtransposeintrin.h @@ -0,0 +1,248 @@ +/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; use instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_TRANSPOSEINTRIN_H +#define __AMX_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) + +#define _tile_2rpntlvwz0(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0(tdst, base, stride) +#define _tile_2rpntlvwz0t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) +#define _tile_2rpntlvwz1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1(tdst, base, stride) +#define _tile_2rpntlvwz1t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) + +/// Transpose 32-bit elements from \a src and write the result to \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_transposed(__tile dst, __tile src); +/// \endcode +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. 
+/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes. +/// +/// \code{.operation} +/// +/// FOR i := 0 TO (dst.rows-1) +/// tmp[511:0] := 0 +/// FOR j := 0 TO (dst.colsb/4-1) +/// tmp.dword[j] := src.row[j].dword[i] +/// ENDFOR +/// dst.row[i] := tmp +/// ENDFOR +/// +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + 
__builtin_ia32_t2rpntlvwz1t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE +_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_ttransposed_internal(m, n, src); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. 
+__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. 
The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. 
+/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Transpose 32-bit elements from src and write the result to dst. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes. 
+__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_transposed(__tile1024i *dst, __tile1024i src) { + dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); +} + +#endif /* __x86_64__ */ +#endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/clang/lib/Headers/cmpccxaddintrin.h b/clang/lib/Headers/cmpccxaddintrin.h index 6957498996c89..0076c402f5ffc 100644 --- a/clang/lib/Headers/cmpccxaddintrin.h +++ b/clang/lib/Headers/cmpccxaddintrin.h @@ -63,7 +63,7 @@ typedef enum { (int)(__D)))) #define _cmpccxadd_epi64(__A, __B, __C, __D) \ - ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \ + ((long long)(__builtin_ia32_cmpccxadd64((__A), (long long)(__B), \ (long long)(__C), (int)(__D)))) #endif // __x86_64__ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 65ad72bc479f4..4bf7eac4195ee 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 052be1395161d..c5614a8e0ee52 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -974,13 +974,9 @@ OptionalFileEntryRef HeaderSearch::LookupFile( const HeaderFileInfo *FromHFI = getExistingFileInfo(*Includer); assert(FromHFI && "includer without file info"); unsigned DirInfo = FromHFI->DirInfo; - bool IndexHeaderMapHeader = FromHFI->IndexHeaderMapHeader; - StringRef Framework = FromHFI->Framework; HeaderFileInfo &ToHFI = getFileInfo(*FE); ToHFI.DirInfo = DirInfo; - ToHFI.IndexHeaderMapHeader = IndexHeaderMapHeader; - ToHFI.Framework = Framework; if (SearchPath) { StringRef 
SearchPathRef(IncluderAndDir.second.getName()); @@ -1122,23 +1118,6 @@ OptionalFileEntryRef HeaderSearch::LookupFile( } } - // Set the `Framework` info if this file is in a header map with framework - // style include spelling or found in a framework dir. The header map case - // is possible when building frameworks which use header maps. - if (CurDir->isHeaderMap() && isAngled) { - size_t SlashPos = Filename.find('/'); - if (SlashPos != StringRef::npos) - HFI.Framework = - getUniqueFrameworkName(StringRef(Filename.begin(), SlashPos)); - if (CurDir->isIndexHeaderMap()) - HFI.IndexHeaderMapHeader = 1; - } else if (CurDir->isFramework()) { - size_t SlashPos = Filename.find('/'); - if (SlashPos != StringRef::npos) - HFI.Framework = - getUniqueFrameworkName(StringRef(Filename.begin(), SlashPos)); - } - if (checkMSVCHeaderSearch(Diags, MSFE, &File->getFileEntry(), IncludeLoc)) { if (SuggestedModule) *SuggestedModule = MSSuggestedModule; @@ -1156,41 +1135,6 @@ OptionalFileEntryRef HeaderSearch::LookupFile( return File; } - // If we are including a file with a quoted include "foo.h" from inside - // a header in a framework that is currently being built, and we couldn't - // resolve "foo.h" any other way, change the include to , where - // "Foo" is the name of the framework in which the including header was found. 
- if (!Includers.empty() && Includers.front().first && !isAngled && - !Filename.contains('/')) { - const HeaderFileInfo *IncludingHFI = - getExistingFileInfo(*Includers.front().first); - assert(IncludingHFI && "includer without file info"); - if (IncludingHFI->IndexHeaderMapHeader) { - SmallString<128> ScratchFilename; - ScratchFilename += IncludingHFI->Framework; - ScratchFilename += '/'; - ScratchFilename += Filename; - - OptionalFileEntryRef File = LookupFile( - ScratchFilename, IncludeLoc, /*isAngled=*/true, FromDir, &CurDir, - Includers.front(), SearchPath, RelativePath, RequestingModule, - SuggestedModule, IsMapped, /*IsFrameworkFound=*/nullptr); - - if (checkMSVCHeaderSearch(Diags, MSFE, - File ? &File->getFileEntry() : nullptr, - IncludeLoc)) { - if (SuggestedModule) - *SuggestedModule = MSSuggestedModule; - return MSFE; - } - - cacheLookupSuccess(LookupFileCache[Filename], - LookupFileCache[ScratchFilename].HitIt, IncludeLoc); - // FIXME: SuggestedModule. - return File; - } - } - if (checkMSVCHeaderSearch(Diags, MSFE, nullptr, IncludeLoc)) { if (SuggestedModule) *SuggestedModule = MSSuggestedModule; @@ -1358,10 +1302,6 @@ static void mergeHeaderFileInfo(HeaderFileInfo &HFI, HFI.DirInfo = OtherHFI.DirInfo; HFI.External = (!HFI.IsValid || HFI.External); HFI.IsValid = true; - HFI.IndexHeaderMapHeader = OtherHFI.IndexHeaderMapHeader; - - if (HFI.Framework.empty()) - HFI.Framework = OtherHFI.Framework; } HeaderFileInfo &HeaderSearch::getFileInfo(FileEntryRef FE) { diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp index 2218db15013d9..86c2ecdf9e36e 100644 --- a/clang/lib/Lex/InitHeaderSearch.cpp +++ b/clang/lib/Lex/InitHeaderSearch.cpp @@ -149,7 +149,7 @@ bool InitHeaderSearch::AddUnmappedPath(const Twine &Path, IncludeDirGroup Group, // Compute the DirectoryLookup type. 
SrcMgr::CharacteristicKind Type; - if (Group == Quoted || Group == Angled || Group == IndexHeaderMap) { + if (Group == Quoted || Group == Angled) { Type = SrcMgr::C_User; } else if (Group == ExternCSystem) { Type = SrcMgr::C_ExternCSystem; @@ -170,9 +170,8 @@ bool InitHeaderSearch::AddUnmappedPath(const Twine &Path, IncludeDirGroup Group, if (auto FE = FM.getOptionalFileRef(MappedPathStr)) { if (const HeaderMap *HM = Headers.CreateHeaderMap(*FE)) { // It is a headermap, add it to the search path. - IncludePath.emplace_back( - Group, DirectoryLookup(HM, Type, Group == IndexHeaderMap), - UserEntryIdx); + IncludePath.emplace_back(Group, DirectoryLookup(HM, Type), + UserEntryIdx); return true; } } @@ -488,7 +487,7 @@ void InitHeaderSearch::Realize(const LangOptions &Lang) { unsigned NumQuoted = SearchList.size(); for (auto &Include : IncludePath) - if (Include.Group == Angled || Include.Group == IndexHeaderMap) + if (Include.Group == Angled) SearchList.push_back(Include); RemoveDuplicates(SearchList, NumQuoted, Verbose); diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 28ccd3061f843..e69fa15248198 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -1454,7 +1454,7 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc, SmallVector KeyIdents; SmallVector KeyLocs; - SmallVector ArgInfos; + SmallVector ObjCParamInfo; ParseScope PrototypeScope(this, Scope::FunctionPrototypeScope | Scope::FunctionDeclarationScope | Scope::DeclScope); @@ -1495,7 +1495,9 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc, ArgInfo.NameLoc = Tok.getLocation(); ConsumeToken(); // Eat the identifier. 
- ArgInfos.push_back(ArgInfo); + ParmVarDecl *Param = Actions.ObjC().ActOnMethodParmDeclaration( + getCurScope(), ArgInfo, ObjCParamInfo.size(), MethodDefinition); + ObjCParamInfo.push_back(Param); KeyIdents.push_back(SelIdent); KeyLocs.push_back(selLoc); @@ -1567,8 +1569,8 @@ Decl *Parser::ParseObjCMethodDecl(SourceLocation mLoc, &KeyIdents[0]); Decl *Result = Actions.ObjC().ActOnMethodDeclaration( getCurScope(), mLoc, Tok.getLocation(), mType, DSRet, ReturnType, KeyLocs, - Sel, &ArgInfos[0], CParamInfo.data(), CParamInfo.size(), methodAttrs, - MethodImplKind, isVariadic, MethodDefinition); + Sel, ObjCParamInfo.data(), CParamInfo.data(), CParamInfo.size(), + methodAttrs, MethodImplKind, isVariadic, MethodDefinition); PD.complete(Result); return Result; diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 357082fe32935..7f9b484ef6c05 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -1261,12 +1261,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef, if (pathContainsInit(Path)) return false; + auto *DRE = dyn_cast(L); // Suppress false positives for code like the one below: - // Ctor(unique_ptr up) : member(*up), member2(move(up)) {} - if (IsLocalGslOwner && pathOnlyHandlesGslPointer(Path)) + // Ctor(unique_ptr up) : pointer(up.get()), owner(move(up)) {} + if (DRE && isRecordWithAttr(DRE->getType())) return false; - auto *DRE = dyn_cast(L); auto *VD = DRE ? dyn_cast(DRE->getDecl()) : nullptr; if (!VD) { // A member was initialized to a local block. 
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index f59654c14f08f..02710e51bc0d1 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -63,49 +63,12 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, OrderIndex = 0; ScopeIndex = 1; break; - case AMDGPU::BI__builtin_amdgcn_mov_dpp: { - if (SemaRef.checkArgCountRange(TheCall, 5, 5)) - return true; - Expr *ValArg = TheCall->getArg(0); - QualType Ty = ValArg->getType(); - // TODO: Vectors can also be supported. - if (!Ty->isArithmeticType() || Ty->isAnyComplexType()) { - SemaRef.Diag(ValArg->getBeginLoc(), - diag::err_typecheck_cond_expect_int_float) - << Ty << ValArg->getSourceRange(); - return true; - } - return false; - } + case AMDGPU::BI__builtin_amdgcn_mov_dpp: + return checkMovDPPFunctionCall(TheCall, 5, 1); + case AMDGPU::BI__builtin_amdgcn_mov_dpp8: + return checkMovDPPFunctionCall(TheCall, 2, 1); case AMDGPU::BI__builtin_amdgcn_update_dpp: { - if (SemaRef.checkArgCountRange(TheCall, 6, 6)) - return true; - Expr *Args[2]; - QualType ArgTys[2]; - for (unsigned I = 0; I != 2; ++I) { - Args[I] = TheCall->getArg(I); - ArgTys[I] = Args[I]->getType(); - // TODO: Vectors can also be supported. 
- if (!ArgTys[I]->isArithmeticType() || ArgTys[I]->isAnyComplexType()) { - SemaRef.Diag(Args[I]->getBeginLoc(), - diag::err_typecheck_cond_expect_int_float) - << ArgTys[I] << Args[I]->getSourceRange(); - return true; - } - } - if (getASTContext().hasSameUnqualifiedType(ArgTys[0], ArgTys[1])) - return false; - if (((ArgTys[0]->isUnsignedIntegerType() && - ArgTys[1]->isSignedIntegerType()) || - (ArgTys[0]->isSignedIntegerType() && - ArgTys[1]->isUnsignedIntegerType())) && - getASTContext().getTypeSize(ArgTys[0]) == - getASTContext().getTypeSize(ArgTys[1])) - return false; - SemaRef.Diag(Args[1]->getBeginLoc(), - diag::err_typecheck_call_different_arg_types) - << ArgTys[0] << ArgTys[1]; - return true; + return checkMovDPPFunctionCall(TheCall, 6, 2); } default: return false; @@ -152,6 +115,44 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, return false; } +bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, + unsigned NumDataArgs) { + assert(NumDataArgs <= 2); + if (SemaRef.checkArgCountRange(TheCall, NumArgs, NumArgs)) + return true; + Expr *Args[2]; + QualType ArgTys[2]; + for (unsigned I = 0; I != NumDataArgs; ++I) { + Args[I] = TheCall->getArg(I); + ArgTys[I] = Args[I]->getType(); + // TODO: Vectors can also be supported. 
+ if (!ArgTys[I]->isArithmeticType() || ArgTys[I]->isAnyComplexType()) { + SemaRef.Diag(Args[I]->getBeginLoc(), + diag::err_typecheck_cond_expect_int_float) + << ArgTys[I] << Args[I]->getSourceRange(); + return true; + } + } + if (NumDataArgs < 2) + return false; + + if (getASTContext().hasSameUnqualifiedType(ArgTys[0], ArgTys[1])) + return false; + + if (((ArgTys[0]->isUnsignedIntegerType() && + ArgTys[1]->isSignedIntegerType()) || + (ArgTys[0]->isSignedIntegerType() && + ArgTys[1]->isUnsignedIntegerType())) && + getASTContext().getTypeSize(ArgTys[0]) == + getASTContext().getTypeSize(ArgTys[1])) + return false; + + SemaRef.Diag(Args[1]->getBeginLoc(), + diag::err_typecheck_call_different_arg_types) + << ArgTys[0] << ArgTys[1]; + return true; +} + static bool checkAMDGPUFlatWorkGroupSizeArguments(Sema &S, Expr *MinExpr, Expr *MaxExpr, const AMDGPUFlatWorkGroupSizeAttr &Attr) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index f8e5f3c6d309d..0cdace25aa792 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3332,9 +3332,7 @@ static void mergeParamDeclTypes(ParmVarDecl *NewParam, } } else { QualType NewT = NewParam->getType(); - NewT = S.Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*Oldnullability), - NewT, NewT); + NewT = S.Context.getAttributedType(*Oldnullability, NewT, NewT); NewParam->setType(NewT); } } @@ -17957,6 +17955,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, << Name; Invalid = true; } + if (TUK == TagUseKind::Declaration) + Invalid = true; } else if (!PrevDecl) { Diag(Loc, diag::warn_decl_in_param_list) << Context.getTagDeclType(New); } diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 78acfeddb7863..431f267fd5be5 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -4572,9 +4572,7 @@ static QualType mergeTypeNullabilityForRedecl(Sema &S, SourceLocation loc, return type; 
// Otherwise, provide the result with the same nullability. - return S.Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*prevNullability), - type, type); + return S.Context.getAttributedType(*prevNullability, type, type); } /// Merge information from the declaration of a method in the \@interface @@ -4720,13 +4718,67 @@ static void checkObjCDirectMethodClashes(Sema &S, ObjCInterfaceDecl *IDecl, diagClash(IMD); } +ParmVarDecl *SemaObjC::ActOnMethodParmDeclaration(Scope *S, + ObjCArgInfo &ArgInfo, + int ParamIndex, + bool MethodDefinition) { + ASTContext &Context = getASTContext(); + QualType ArgType; + TypeSourceInfo *DI; + + if (!ArgInfo.Type) { + ArgType = Context.getObjCIdType(); + DI = nullptr; + } else { + ArgType = SemaRef.GetTypeFromParser(ArgInfo.Type, &DI); + } + LookupResult R(SemaRef, ArgInfo.Name, ArgInfo.NameLoc, + Sema::LookupOrdinaryName, + SemaRef.forRedeclarationInCurContext()); + SemaRef.LookupName(R, S); + if (R.isSingleResult()) { + NamedDecl *PrevDecl = R.getFoundDecl(); + if (S->isDeclScope(PrevDecl)) { + Diag(ArgInfo.NameLoc, + (MethodDefinition ? diag::warn_method_param_redefinition + : diag::warn_method_param_declaration)) + << ArgInfo.Name; + Diag(PrevDecl->getLocation(), diag::note_previous_declaration); + } + } + SourceLocation StartLoc = + DI ? DI->getTypeLoc().getBeginLoc() : ArgInfo.NameLoc; + + // Temporarily put parameter variables in the translation unit. This is what + // ActOnParamDeclarator does in the case of C arguments to the Objective-C + // method too. + ParmVarDecl *Param = SemaRef.CheckParameter( + Context.getTranslationUnitDecl(), StartLoc, ArgInfo.NameLoc, ArgInfo.Name, + ArgType, DI, SC_None); + Param->setObjCMethodScopeInfo(ParamIndex); + Param->setObjCDeclQualifier( + CvtQTToAstBitMask(ArgInfo.DeclSpec.getObjCDeclQualifier())); + + // Apply the attributes to the parameter. 
+ SemaRef.ProcessDeclAttributeList(SemaRef.TUScope, Param, ArgInfo.ArgAttrs); + SemaRef.AddPragmaAttributes(SemaRef.TUScope, Param); + if (Param->hasAttr()) { + Diag(Param->getLocation(), diag::err_block_on_nonlocal); + Param->setInvalidDecl(); + } + + S->AddDecl(Param); + SemaRef.IdResolver.AddDecl(Param); + return Param; +} + Decl *SemaObjC::ActOnMethodDeclaration( Scope *S, SourceLocation MethodLoc, SourceLocation EndLoc, tok::TokenKind MethodType, ObjCDeclSpec &ReturnQT, ParsedType ReturnType, ArrayRef SelectorLocs, Selector Sel, // optional arguments. The number of types/arguments is obtained // from the Sel.getNumArgs(). - ObjCArgInfo *ArgInfo, DeclaratorChunk::ParamInfo *CParamInfo, + ParmVarDecl **ArgInfo, DeclaratorChunk::ParamInfo *CParamInfo, unsigned CNumArgs, // c-style args const ParsedAttributesView &AttrList, tok::ObjCKeywordKind MethodDeclKind, bool isVariadic, bool MethodDefinition) { @@ -4768,60 +4820,10 @@ Decl *SemaObjC::ActOnMethodDeclaration( HasRelatedResultType); SmallVector Params; - - for (unsigned i = 0, e = Sel.getNumArgs(); i != e; ++i) { - QualType ArgType; - TypeSourceInfo *DI; - - if (!ArgInfo[i].Type) { - ArgType = Context.getObjCIdType(); - DI = nullptr; - } else { - ArgType = SemaRef.GetTypeFromParser(ArgInfo[i].Type, &DI); - } - - LookupResult R(SemaRef, ArgInfo[i].Name, ArgInfo[i].NameLoc, - Sema::LookupOrdinaryName, - SemaRef.forRedeclarationInCurContext()); - SemaRef.LookupName(R, S); - if (R.isSingleResult()) { - NamedDecl *PrevDecl = R.getFoundDecl(); - if (S->isDeclScope(PrevDecl)) { - Diag(ArgInfo[i].NameLoc, - (MethodDefinition ? diag::warn_method_param_redefinition - : diag::warn_method_param_declaration)) - << ArgInfo[i].Name; - Diag(PrevDecl->getLocation(), - diag::note_previous_declaration); - } - } - - SourceLocation StartLoc = DI - ? 
DI->getTypeLoc().getBeginLoc() - : ArgInfo[i].NameLoc; - - ParmVarDecl *Param = - SemaRef.CheckParameter(ObjCMethod, StartLoc, ArgInfo[i].NameLoc, - ArgInfo[i].Name, ArgType, DI, SC_None); - - Param->setObjCMethodScopeInfo(i); - - Param->setObjCDeclQualifier( - CvtQTToAstBitMask(ArgInfo[i].DeclSpec.getObjCDeclQualifier())); - - // Apply the attributes to the parameter. - SemaRef.ProcessDeclAttributeList(SemaRef.TUScope, Param, - ArgInfo[i].ArgAttrs); - SemaRef.AddPragmaAttributes(SemaRef.TUScope, Param); + for (unsigned I = 0; I < Sel.getNumArgs(); ++I) { + ParmVarDecl *Param = ArgInfo[I]; + Param->setDeclContext(ObjCMethod); SemaRef.ProcessAPINotes(Param); - - if (Param->hasAttr()) { - Diag(Param->getLocation(), diag::err_block_on_nonlocal); - Param->setInvalidDecl(); - } - S->AddDecl(Param); - SemaRef.IdResolver.AddDecl(Param); - Params.push_back(Param); } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index ff6616901016a..7f3cff1054aee 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -8757,8 +8757,7 @@ static QualType computeConditionalNullability(QualType ResTy, bool IsBin, ResTy = ResTy.getSingleStepDesugaredType(Ctx); // Create a new AttributedType with the new nullability kind. - auto NewAttr = AttributedType::getNullabilityAttrKind(MergedKind); - return Ctx.getAttributedType(NewAttr, ResTy, ResTy); + return Ctx.getAttributedType(MergedKind, ResTy, ResTy); } ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc, diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 35fbc4e7c30eb..3fcbbb417ff1f 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -551,9 +551,7 @@ ExprResult SemaObjC::BuildObjCBoxedExpr(SourceRange SR, Expr *ValueExpr) { const llvm::UTF8 *StrEnd = Str.bytes_end(); // Check that this is a valid UTF-8 string. 
if (llvm::isLegalUTF8String(&StrBegin, StrEnd)) { - BoxedType = Context.getAttributedType( - AttributedType::getNullabilityAttrKind( - NullabilityKind::NonNull), + BoxedType = Context.getAttributedType(NullabilityKind::NonNull, NSStringPointer, NSStringPointer); return new (Context) ObjCBoxedExpr(CE, BoxedType, nullptr, SR); } @@ -605,9 +603,8 @@ ExprResult SemaObjC::BuildObjCBoxedExpr(SourceRange SR, Expr *ValueExpr) { std::optional Nullability = BoxingMethod->getReturnType()->getNullability(); if (Nullability) - BoxedType = Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*Nullability), BoxedType, - BoxedType); + BoxedType = + Context.getAttributedType(*Nullability, BoxedType, BoxedType); } } else if (ValueType->isBuiltinType()) { // The other types we support are numeric, char and BOOL/bool. We could also @@ -1444,10 +1441,8 @@ static QualType stripObjCInstanceType(ASTContext &Context, QualType T) { QualType origType = T; if (auto nullability = AttributedType::stripOuterNullability(T)) { if (T == Context.getObjCInstanceType()) { - return Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*nullability), - Context.getObjCIdType(), - Context.getObjCIdType()); + return Context.getAttributedType(*nullability, Context.getObjCIdType(), + Context.getObjCIdType()); } return origType; @@ -1485,10 +1480,7 @@ static QualType getBaseMessageSendResultType(Sema &S, (void)AttributedType::stripOuterNullability(type); // Form a new attributed type using the method result type's nullability. 
- return Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*nullability), - type, - type); + return Context.getAttributedType(*nullability, type, type); } return type; @@ -1559,9 +1551,8 @@ QualType SemaObjC::getMessageSendResultType(const Expr *Receiver, QualType NewResultType = Context.getObjCObjectPointerType( Context.getObjCInterfaceType(MD->getClassInterface())); if (auto Nullability = resultType->getNullability()) - NewResultType = Context.getAttributedType( - AttributedType::getNullabilityAttrKind(*Nullability), - NewResultType, NewResultType); + NewResultType = Context.getAttributedType(*Nullability, NewResultType, + NewResultType); return NewResultType; } } @@ -1623,9 +1614,7 @@ QualType SemaObjC::getMessageSendResultType(const Expr *Receiver, if (newResultNullabilityIdx > 0) { auto newNullability = static_cast(newResultNullabilityIdx-1); - return Context.getAttributedType( - AttributedType::getNullabilityAttrKind(newNullability), - resultType, resultType); + return Context.getAttributedType(newNullability, resultType, resultType); } return resultType; diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 5c4dc93ff16cd..6cef76fbde1d2 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -2460,7 +2460,7 @@ void SemaObjC::ProcessPropertyDecl(ObjCPropertyDecl *property) { QualType modifiedTy = resultTy; if (auto nullability = AttributedType::stripOuterNullability(modifiedTy)) { if (*nullability == NullabilityKind::Unspecified) - resultTy = Context.getAttributedType(attr::TypeNonNull, + resultTy = Context.getAttributedType(NullabilityKind::NonNull, modifiedTy, modifiedTy); } } @@ -2538,7 +2538,7 @@ void SemaObjC::ProcessPropertyDecl(ObjCPropertyDecl *property) { QualType modifiedTy = paramTy; if (auto nullability = AttributedType::stripOuterNullability(modifiedTy)){ if (*nullability == NullabilityKind::Unspecified) - paramTy = 
Context.getAttributedType(attr::TypeNullable, + paramTy = Context.getAttributedType(NullabilityKind::Nullable, modifiedTy, modifiedTy); } } diff --git a/clang/lib/Sema/SemaSwift.cpp b/clang/lib/Sema/SemaSwift.cpp index 2eebce74b5e2f..24fdfb8e57dc3 100644 --- a/clang/lib/Sema/SemaSwift.cpp +++ b/clang/lib/Sema/SemaSwift.cpp @@ -73,11 +73,16 @@ static bool isValidSwiftErrorResultType(QualType Ty) { } void SemaSwift::handleAttrAttr(Decl *D, const ParsedAttr &AL) { + if (AL.isInvalid() || AL.isUsedAsTypeAttr()) + return; + // Make sure that there is a string literal as the annotation's single // argument. StringRef Str; - if (!SemaRef.checkStringLiteralArgumentAttr(AL, 0, Str)) + if (!SemaRef.checkStringLiteralArgumentAttr(AL, 0, Str)) { + AL.setInvalid(); return; + } D->addAttr(::new (getASTContext()) SwiftAttrAttr(getASTContext(), AL, Str)); } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index fcf05798d9c70..4503e60cff8c2 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4381,8 +4381,20 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, SmallVector PartialSpecs; Template->getPartialSpecializations(PartialSpecs); - for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) { - VarTemplatePartialSpecializationDecl *Partial = PartialSpecs[I]; + for (VarTemplatePartialSpecializationDecl *Partial : PartialSpecs) { + // C++ [temp.spec.partial.member]p2: + // If the primary member template is explicitly specialized for a given + // (implicit) specialization of the enclosing class template, the partial + // specializations of the member template are ignored for this + // specialization of the enclosing class template. 
If a partial + // specialization of the member template is explicitly specialized for a + // given (implicit) specialization of the enclosing class template, the + // primary member template and its other partial specializations are still + // considered for this specialization of the enclosing class template. + if (Template->getMostRecentDecl()->isMemberSpecialization() && + !Partial->getMostRecentDecl()->isMemberSpecialization()) + continue; + TemplateDeductionInfo Info(FailedCandidates.getLocation()); if (TemplateDeductionResult Result = diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index dea97bfce532c..b63063813f1b5 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -3978,11 +3978,24 @@ bool Sema::usesPartialOrExplicitSpecialization( return true; SmallVector PartialSpecs; - ClassTemplateSpec->getSpecializedTemplate() - ->getPartialSpecializations(PartialSpecs); - for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) { + ClassTemplateDecl *CTD = ClassTemplateSpec->getSpecializedTemplate(); + CTD->getPartialSpecializations(PartialSpecs); + for (ClassTemplatePartialSpecializationDecl *CTPSD : PartialSpecs) { + // C++ [temp.spec.partial.member]p2: + // If the primary member template is explicitly specialized for a given + // (implicit) specialization of the enclosing class template, the partial + // specializations of the member template are ignored for this + // specialization of the enclosing class template. If a partial + // specialization of the member template is explicitly specialized for a + // given (implicit) specialization of the enclosing class template, the + // primary member template and its other partial specializations are still + // considered for this specialization of the enclosing class template. 
+ if (CTD->getMostRecentDecl()->isMemberSpecialization() && + !CTPSD->getMostRecentDecl()->isMemberSpecialization()) + continue; + TemplateDeductionInfo Info(Loc); - if (DeduceTemplateArguments(PartialSpecs[I], + if (DeduceTemplateArguments(CTPSD, ClassTemplateSpec->getTemplateArgs().asArray(), Info) == TemplateDeductionResult::Success) return true; @@ -4025,8 +4038,21 @@ getPatternForClassTemplateSpecialization( SmallVector PartialSpecs; Template->getPartialSpecializations(PartialSpecs); TemplateSpecCandidateSet FailedCandidates(PointOfInstantiation); - for (unsigned I = 0, N = PartialSpecs.size(); I != N; ++I) { - ClassTemplatePartialSpecializationDecl *Partial = PartialSpecs[I]; + for (ClassTemplatePartialSpecializationDecl *Partial : PartialSpecs) { + // C++ [temp.spec.partial.member]p2: + // If the primary member template is explicitly specialized for a given + // (implicit) specialization of the enclosing class template, the + // partial specializations of the member template are ignored for this + // specialization of the enclosing class template. If a partial + // specialization of the member template is explicitly specialized for a + // given (implicit) specialization of the enclosing class template, the + // primary member template and its other partial specializations are + // still considered for this specialization of the enclosing class + // template. 
+ if (Template->getMostRecentDecl()->isMemberSpecialization() && + !Partial->getMostRecentDecl()->isMemberSpecialization()) + continue; + TemplateDeductionInfo Info(FailedCandidates.getLocation()); if (TemplateDeductionResult Result = S.DeduceTemplateArguments( Partial, ClassTemplateSpec->getTemplateArgs().asArray(), Info); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 5d043e6684573..e526a11973975 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -289,7 +289,7 @@ namespace { QualType getAttributedType(Attr *A, QualType ModifiedType, QualType EquivType) { QualType T = - sema.Context.getAttributedType(A->getKind(), ModifiedType, EquivType); + sema.Context.getAttributedType(A, ModifiedType, EquivType); AttrsForTypes.push_back({cast(T.getTypePtr()), A}); AttrsForTypesSorted = false; return T; } @@ -7161,6 +7161,60 @@ static bool HandleWebAssemblyFuncrefAttr(TypeProcessingState &State, return false; } +static void HandleSwiftAttr(TypeProcessingState &State, TypeAttrLocation TAL, + QualType &QT, ParsedAttr &PAttr) { + if (TAL == TAL_DeclName) + return; + + Sema &S = State.getSema(); + auto &D = State.getDeclarator(); + + // If the attribute appears in declaration specifiers + // it should be handled as a declaration attribute, + // unless it's associated with a type or a function + // prototype (i.e. appears on a parameter or result type). + if (State.isProcessingDeclSpec()) { + if (!(D.isPrototypeContext() || + D.getContext() == DeclaratorContext::TypeName)) + return; + + if (auto *chunk = D.getInnermostNonParenChunk()) { + moveAttrFromListToList(PAttr, State.getCurrentAttributes(), + const_cast(chunk)->getAttrs()); + return; + } + } + + StringRef Str; + if (!S.checkStringLiteralArgumentAttr(PAttr, 0, Str)) { + PAttr.setInvalid(); + return; + } + + // If the attribute is attached to a paren move it closer to + // the declarator. This can happen in block declarations when + // an attribute is placed before `^` i.e. 
`(__attribute__((...)) ^)`. + // + // Note that it's actually invalid to use GNU style attributes + // in a block but such cases are currently handled gracefully + // by the parser and behavior should be consistent between + // cases when attribute appears before/after block's result + // type and inside (^). + if (TAL == TAL_DeclChunk) { + auto chunkIdx = State.getCurrentChunkIndex(); + if (chunkIdx >= 1 && + D.getTypeObject(chunkIdx).Kind == DeclaratorChunk::Paren) { + moveAttrFromListToList(PAttr, State.getCurrentAttributes(), + D.getTypeObject(chunkIdx - 1).getAttrs()); + return; + } + } + + auto *A = ::new (S.Context) SwiftAttrAttr(S.Context, PAttr, Str); + QT = State.getAttributedType(A, QT, QT); + PAttr.setUsedAsTypeAttr(); +} + /// Rebuild an attributed type without the nullability attribute on it. static QualType rebuildAttributedTypeWithoutNullability(ASTContext &Ctx, QualType Type) { @@ -7177,7 +7231,8 @@ static QualType rebuildAttributedTypeWithoutNullability(ASTContext &Ctx, Ctx, Attributed->getModifiedType()); assert(Modified.getTypePtr() != Attributed->getModifiedType().getTypePtr()); return Ctx.getAttributedType(Attributed->getAttrKind(), Modified, - Attributed->getEquivalentType()); + Attributed->getEquivalentType(), + Attributed->getAttr()); } /// Map a nullability attribute kind to a nullability kind. 
@@ -7306,8 +7361,7 @@ static bool CheckNullabilityTypeSpecifier( Attr *A = createNullabilityAttr(S.Context, *PAttr, Nullability); QT = State->getAttributedType(A, QT, QT); } else { - attr::Kind attrKind = AttributedType::getNullabilityAttrKind(Nullability); - QT = S.Context.getAttributedType(attrKind, QT, QT); + QT = S.Context.getAttributedType(Nullability, QT, QT); } return false; } @@ -8749,6 +8803,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, break; } + case ParsedAttr::AT_SwiftAttr: { + HandleSwiftAttr(state, TAL, type, attr); + break; + } + MS_TYPE_ATTRS_CASELIST: if (!handleMSPointerTypeQualifierAttr(state, attr, type)) attr.setUsedAsTypeAttr(); diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 6a4d78f0ca908..ef878d16d445f 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -631,6 +631,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tileloaddt164: case X86::BI__builtin_ia32_tilestored64: case X86::BI__builtin_ia32_tilezero: + case X86::BI__builtin_ia32_t2rpntlvwz0: + case X86::BI__builtin_ia32_t2rpntlvwz0t1: + case X86::BI__builtin_ia32_t2rpntlvwz1: + case X86::BI__builtin_ia32_t2rpntlvwz1t1: return CheckBuiltinTileArgumentsRange(TheCall, 0); case X86::BI__builtin_ia32_tdpbssd: case X86::BI__builtin_ia32_tdpbsud: @@ -640,7 +644,13 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpfp16ps: case X86::BI__builtin_ia32_tcmmimfp16ps: case X86::BI__builtin_ia32_tcmmrlfp16ps: + case X86::BI__builtin_ia32_tdpbf8ps: + case X86::BI__builtin_ia32_tdpbhf8ps: + case X86::BI__builtin_ia32_tdphbf8ps: + case X86::BI__builtin_ia32_tdphf8ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); + case X86::BI__builtin_ia32_ttransposed: + return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); } } static bool isX86_32Builtin(unsigned BuiltinID) { diff --git 
a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index d24d8d5335e28..aa34300dfa58d 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7435,7 +7435,8 @@ QualType TreeTransform::TransformAttributedType(TypeLocBuilder &TLB, result = SemaRef.Context.getAttributedType(TL.getAttrKind(), modifiedType, - equivalentType); + equivalentType, + TL.getAttr()); } AttributedTypeLoc newTL = TLB.push(result); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8d8f9378cfeab..004a584ff77b4 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2042,19 +2042,15 @@ ASTReader::getGlobalPreprocessedEntityID(ModuleFile &M, return LocalID + I->second; } -const FileEntry *HeaderFileInfoTrait::getFile(const internal_key_type &Key) { +OptionalFileEntryRef +HeaderFileInfoTrait::getFile(const internal_key_type &Key) { FileManager &FileMgr = Reader.getFileManager(); - if (!Key.Imported) { - if (auto File = FileMgr.getOptionalFileRef(Key.Filename)) - return *File; - return nullptr; - } + if (!Key.Imported) + return FileMgr.getOptionalFileRef(Key.Filename); - std::string Resolved = std::string(Key.Filename); - Reader.ResolveImportedPath(M, Resolved); - if (auto File = FileMgr.getOptionalFileRef(Resolved)) - return *File; - return nullptr; + auto Resolved = + ASTReader::ResolveImportedPath(Reader.getPathBuf(), Key.Filename, M); + return FileMgr.getOptionalFileRef(*Resolved); } unsigned HeaderFileInfoTrait::ComputeHash(internal_key_ref ikey) { @@ -2080,8 +2076,8 @@ bool HeaderFileInfoTrait::EqualKey(internal_key_ref a, internal_key_ref b) { return true; // Determine whether the actual files are equivalent. 
- const FileEntry *FEA = getFile(a); - const FileEntry *FEB = getFile(b); + OptionalFileEntryRef FEA = getFile(a); + OptionalFileEntryRef FEB = getFile(b); return FEA && FEA == FEB; } @@ -2112,27 +2108,20 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, HeaderFileInfo HFI; unsigned Flags = *d++; + OptionalFileEntryRef FE; bool Included = (Flags >> 6) & 0x01; if (Included) - if (const FileEntry *FE = getFile(key)) + if ((FE = getFile(key))) // Not using \c Preprocessor::markIncluded(), since that would attempt to // deserialize this header file info again. - Reader.getPreprocessor().getIncludedFiles().insert(FE); + Reader.getPreprocessor().getIncludedFiles().insert(*FE); // FIXME: Refactor with mergeHeaderFileInfo in HeaderSearch.cpp. HFI.isImport |= (Flags >> 5) & 0x01; HFI.isPragmaOnce |= (Flags >> 4) & 0x01; HFI.DirInfo = (Flags >> 1) & 0x07; - HFI.IndexHeaderMapHeader = Flags & 0x01; HFI.LazyControllingMacro = Reader.getGlobalIdentifierID( M, endian::readNext(d)); - if (unsigned FrameworkOffset = - endian::readNext(d)) { - // The framework offset is 1 greater than the actual offset, - // since 0 is used as an indicator for "no framework name". - StringRef FrameworkName(FrameworkStrings + FrameworkOffset - 1); - HFI.Framework = HS->getUniqueFrameworkName(FrameworkName); - } assert((End - d) % 4 == 0 && "Wrong data length in HeaderFileInfo deserialization"); @@ -2146,14 +2135,10 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, // implicit module import. 
SubmoduleID GlobalSMID = Reader.getGlobalSubmoduleID(M, LocalSMID); Module *Mod = Reader.getSubmodule(GlobalSMID); - FileManager &FileMgr = Reader.getFileManager(); ModuleMap &ModMap = Reader.getPreprocessor().getHeaderSearchInfo().getModuleMap(); - std::string Filename = std::string(key.Filename); - if (key.Imported) - Reader.ResolveImportedPath(M, Filename); - if (auto FE = FileMgr.getOptionalFileRef(Filename)) { + if (FE || (FE = getFile(key))) { // FIXME: NameAsWritten Module::Header H = {std::string(key.Filename), "", *FE}; ModMap.addHeader(Mod, H, HeaderRole, /*Imported=*/true); @@ -2520,11 +2505,12 @@ InputFileInfo ASTReader::getInputFileInfo(ModuleFile &F, unsigned ID) { std::tie(R.FilenameAsRequested, R.Filename) = [&]() { uint16_t AsRequestedLength = Record[7]; - std::string NameAsRequested = Blob.substr(0, AsRequestedLength).str(); - std::string Name = Blob.substr(AsRequestedLength).str(); + StringRef NameAsRequestedRef = Blob.substr(0, AsRequestedLength); + StringRef NameRef = Blob.substr(AsRequestedLength); - ResolveImportedPath(F, NameAsRequested); - ResolveImportedPath(F, Name); + std::string NameAsRequested = + ResolveImportedPathAndAllocate(PathBuf, NameAsRequestedRef, F); + std::string Name = ResolveImportedPathAndAllocate(PathBuf, NameRef, F); if (Name.empty()) Name = NameAsRequested; @@ -2750,23 +2736,38 @@ InputFile ASTReader::getInputFile(ModuleFile &F, unsigned ID, bool Complain) { return IF; } -/// If we are loading a relocatable PCH or module file, and the filename -/// is not an absolute path, add the system or module root to the beginning of -/// the file name. -void ASTReader::ResolveImportedPath(ModuleFile &M, std::string &Filename) { - // Resolve relative to the base directory, if we have one. 
- if (!M.BaseDirectory.empty()) - return ResolveImportedPath(Filename, M.BaseDirectory); +ASTReader::TemporarilyOwnedStringRef +ASTReader::ResolveImportedPath(SmallString<0> &Buf, StringRef Path, + ModuleFile &ModF) { + return ResolveImportedPath(Buf, Path, ModF.BaseDirectory); } -void ASTReader::ResolveImportedPath(std::string &Filename, StringRef Prefix) { - if (Filename.empty() || llvm::sys::path::is_absolute(Filename) || - Filename == "" || Filename == "") - return; +ASTReader::TemporarilyOwnedStringRef +ASTReader::ResolveImportedPath(SmallString<0> &Buf, StringRef Path, + StringRef Prefix) { + assert(Buf.capacity() != 0 && "Overlapping ResolveImportedPath calls"); + + if (Prefix.empty() || Path.empty() || llvm::sys::path::is_absolute(Path) || + Path == "" || Path == "") + return {Path, Buf}; - SmallString<128> Buffer; - llvm::sys::path::append(Buffer, Prefix, Filename); - Filename.assign(Buffer.begin(), Buffer.end()); + Buf.clear(); + llvm::sys::path::append(Buf, Prefix, Path); + StringRef ResolvedPath{Buf.data(), Buf.size()}; + return {ResolvedPath, Buf}; +} + +std::string ASTReader::ResolveImportedPathAndAllocate(SmallString<0> &Buf, + StringRef P, + ModuleFile &ModF) { + return ResolveImportedPathAndAllocate(Buf, P, ModF.BaseDirectory); +} + +std::string ASTReader::ResolveImportedPathAndAllocate(SmallString<0> &Buf, + StringRef P, + StringRef Prefix) { + auto ResolvedPath = ResolveImportedPath(Buf, P, Prefix); + return ResolvedPath->str(); } static bool isDiagnosedResult(ASTReader::ASTReadResult ARR, unsigned Caps) { @@ -3194,8 +3195,8 @@ ASTReader::ReadControlBlock(ModuleFile &F, case ORIGINAL_FILE: F.OriginalSourceFileID = FileID::get(Record[0]); F.ActualOriginalSourceFileName = std::string(Blob); - F.OriginalSourceFileName = F.ActualOriginalSourceFileName; - ResolveImportedPath(F, F.OriginalSourceFileName); + F.OriginalSourceFileName = ResolveImportedPathAndAllocate( + PathBuf, F.ActualOriginalSourceFileName, F); break; case ORIGINAL_FILE_ID: @@ -3894,13 
+3895,10 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, F.HeaderFileInfoTableData = Blob.data(); F.LocalNumHeaderFileInfos = Record[1]; if (Record[0]) { - F.HeaderFileInfoTable - = HeaderFileInfoLookupTable::Create( - (const unsigned char *)F.HeaderFileInfoTableData + Record[0], - (const unsigned char *)F.HeaderFileInfoTableData, - HeaderFileInfoTrait(*this, F, - &PP.getHeaderSearchInfo(), - Blob.data() + Record[2])); + F.HeaderFileInfoTable = HeaderFileInfoLookupTable::Create( + (const unsigned char *)F.HeaderFileInfoTableData + Record[0], + (const unsigned char *)F.HeaderFileInfoTableData, + HeaderFileInfoTrait(*this, F)); PP.getHeaderSearchInfo().SetExternalSource(this); if (!PP.getHeaderSearchInfo().getExternalLookup()) @@ -5484,6 +5482,8 @@ bool ASTReader::readASTFileControlBlock( RecordData Record; std::string ModuleDir; bool DoneWithControlBlock = false; + SmallString<0> PathBuf; + PathBuf.reserve(256); while (!DoneWithControlBlock) { Expected MaybeEntry = Stream.advance(); if (!MaybeEntry) { @@ -5566,9 +5566,9 @@ bool ASTReader::readASTFileControlBlock( break; case MODULE_MAP_FILE: { unsigned Idx = 0; - auto Path = ReadString(Record, Idx); - ResolveImportedPath(Path, ModuleDir); - Listener.ReadModuleMapFile(Path); + std::string PathStr = ReadString(Record, Idx); + auto Path = ResolveImportedPath(PathBuf, PathStr, ModuleDir); + Listener.ReadModuleMapFile(*Path); break; } case INPUT_FILE_OFFSETS: { @@ -5615,10 +5615,9 @@ bool ASTReader::readASTFileControlBlock( break; case INPUT_FILE: bool Overridden = static_cast(Record[3]); - std::string Filename = std::string(Blob); - ResolveImportedPath(Filename, ModuleDir); + auto Filename = ResolveImportedPath(PathBuf, Blob, ModuleDir); shouldContinue = Listener.visitInputFile( - Filename, isSystemFile, Overridden, /*IsExplicitModule*/false); + *Filename, isSystemFile, Overridden, /*IsExplicitModule=*/false); break; } if (!shouldContinue) @@ -5653,9 +5652,9 @@ bool ASTReader::readASTFileControlBlock( // Skip 
Size, ModTime and Signature Idx += 1 + 1 + ASTFileSignature::size; std::string ModuleName = ReadString(Record, Idx); - std::string Filename = ReadString(Record, Idx); - ResolveImportedPath(Filename, ModuleDir); - Listener.visitImport(ModuleName, Filename); + std::string FilenameStr = ReadString(Record, Idx); + auto Filename = ResolveImportedPath(PathBuf, FilenameStr, ModuleDir); + Listener.visitImport(ModuleName, *Filename); } break; } @@ -5908,9 +5907,8 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, // FIXME: This doesn't work for framework modules as `Filename` is the // name as written in the module file and does not include // `Headers/`, so this path will never exist. - std::string Filename = std::string(Blob); - ResolveImportedPath(F, Filename); - if (auto Umbrella = PP.getFileManager().getOptionalFileRef(Filename)) { + auto Filename = ResolveImportedPath(PathBuf, Blob, F); + if (auto Umbrella = PP.getFileManager().getOptionalFileRef(*Filename)) { if (!CurrentModule->getUmbrellaHeaderAsWritten()) { // FIXME: NameAsWritten ModMap.setUmbrellaHeaderAsWritten(CurrentModule, *Umbrella, Blob, ""); @@ -5938,18 +5936,16 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, break; case SUBMODULE_TOPHEADER: { - std::string HeaderName(Blob); - ResolveImportedPath(F, HeaderName); - CurrentModule->addTopHeaderFilename(HeaderName); + auto HeaderName = ResolveImportedPath(PathBuf, Blob, F); + CurrentModule->addTopHeaderFilename(*HeaderName); break; } case SUBMODULE_UMBRELLA_DIR: { // See comments in SUBMODULE_UMBRELLA_HEADER - std::string Dirname = std::string(Blob); - ResolveImportedPath(F, Dirname); + auto Dirname = ResolveImportedPath(PathBuf, Blob, F); if (auto Umbrella = - PP.getFileManager().getOptionalDirectoryRef(Dirname)) { + PP.getFileManager().getOptionalDirectoryRef(*Dirname)) { if (!CurrentModule->getUmbrellaDirAsWritten()) { // FIXME: NameAsWritten ModMap.setUmbrellaDirAsWritten(CurrentModule, *Umbrella, Blob, ""); @@ -9604,17 +9600,13 @@ 
std::string ASTReader::ReadString(const RecordDataImpl &Record, unsigned &Idx) { std::string ASTReader::ReadPath(ModuleFile &F, const RecordData &Record, unsigned &Idx) { - std::string Filename = ReadString(Record, Idx); - ResolveImportedPath(F, Filename); - return Filename; + return ReadPath(F.BaseDirectory, Record, Idx); } std::string ASTReader::ReadPath(StringRef BaseDirectory, const RecordData &Record, unsigned &Idx) { std::string Filename = ReadString(Record, Idx); - if (!BaseDirectory.empty()) - ResolveImportedPath(Filename, BaseDirectory); - return Filename; + return ResolveImportedPathAndAllocate(PathBuf, Filename, BaseDirectory); } VersionTuple ASTReader::ReadVersionTuple(const RecordData &Record, @@ -10519,6 +10511,8 @@ ASTReader::ASTReader(Preprocessor &PP, InMemoryModuleCache &ModuleCache, UseGlobalIndex(UseGlobalIndex), CurrSwitchCaseStmts(&SwitchCaseStmts) { SourceMgr.setExternalSLocEntrySource(this); + PathBuf.reserve(256); + for (const auto &Ext : Extensions) { auto BlockName = Ext->getExtensionMetadata().BlockName; auto Known = ModuleFileExtensions.find(BlockName); diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h index 536b19f91691e..4f7e6f4b2741b 100644 --- a/clang/lib/Serialization/ASTReaderInternals.h +++ b/clang/lib/Serialization/ASTReaderInternals.h @@ -243,8 +243,6 @@ using ASTSelectorLookupTable = class HeaderFileInfoTrait { ASTReader &Reader; ModuleFile &M; - HeaderSearch *HS; - const char *FrameworkStrings; public: using external_key_type = FileEntryRef; @@ -262,9 +260,8 @@ class HeaderFileInfoTrait { using hash_value_type = unsigned; using offset_type = unsigned; - HeaderFileInfoTrait(ASTReader &Reader, ModuleFile &M, HeaderSearch *HS, - const char *FrameworkStrings) - : Reader(Reader), M(M), HS(HS), FrameworkStrings(FrameworkStrings) {} + HeaderFileInfoTrait(ASTReader &Reader, ModuleFile &M) + : Reader(Reader), M(M) {} static hash_value_type ComputeHash(internal_key_ref ikey); 
internal_key_type GetInternalKey(external_key_type ekey); @@ -278,7 +275,7 @@ class HeaderFileInfoTrait { data_type ReadData(internal_key_ref,const unsigned char *d, unsigned DataLen); private: - const FileEntry *getFile(const internal_key_type &Key); + OptionalFileEntryRef getFile(const internal_key_type &Key); }; /// The on-disk hash table used for known header files. diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 569c688f793d8..732c7ef01c0db 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -1958,10 +1958,6 @@ namespace { class HeaderFileInfoTrait { ASTWriter &Writer; - // Keep track of the framework names we've used during serialization. - SmallString<128> FrameworkStringData; - llvm::StringMap FrameworkNameOffset; - public: HeaderFileInfoTrait(ASTWriter &Writer) : Writer(Writer) {} @@ -2005,7 +2001,7 @@ namespace { std::pair EmitKeyDataLength(raw_ostream& Out, key_type_ref key, data_type_ref Data) { unsigned KeyLen = key.Filename.size() + 1 + 8 + 8; - unsigned DataLen = 1 + sizeof(IdentifierID) + 4; + unsigned DataLen = 1 + sizeof(IdentifierID); for (auto ModInfo : Data.KnownHeaders) if (Writer.getLocalOrImportedSubmoduleID(ModInfo.getModule())) DataLen += 4; @@ -2036,8 +2032,7 @@ namespace { | (Data.HFI.isImport << 5) | (Writer.isWritingStdCXXNamedModules() ? 0 : Data.HFI.isPragmaOnce << 4) - | (Data.HFI.DirInfo << 1) - | Data.HFI.IndexHeaderMapHeader; + | (Data.HFI.DirInfo << 1); LE.write(Flags); if (Data.HFI.LazyControllingMacro.isID()) @@ -2046,22 +2041,6 @@ namespace { LE.write( Writer.getIdentifierRef(Data.HFI.LazyControllingMacro.getPtr())); - unsigned Offset = 0; - if (!Data.HFI.Framework.empty()) { - // If this header refers into a framework, save the framework name. 
- llvm::StringMap::iterator Pos - = FrameworkNameOffset.find(Data.HFI.Framework); - if (Pos == FrameworkNameOffset.end()) { - Offset = FrameworkStringData.size() + 1; - FrameworkStringData.append(Data.HFI.Framework); - FrameworkStringData.push_back(0); - - FrameworkNameOffset[Data.HFI.Framework] = Offset; - } else - Offset = Pos->second; - } - LE.write(Offset); - auto EmitModule = [&](Module *M, ModuleMap::ModuleHeaderRole Role) { if (uint32_t ModID = Writer.getLocalOrImportedSubmoduleID(M)) { uint32_t Value = (ModID << 3) | (unsigned)Role; @@ -2077,9 +2056,6 @@ namespace { assert(Out.tell() - Start == DataLen && "Wrong data length"); } - - const char *strings_begin() const { return FrameworkStringData.begin(); } - const char *strings_end() const { return FrameworkStringData.end(); } }; } // namespace @@ -2214,7 +2190,6 @@ void ASTWriter::WriteHeaderSearch(const HeaderSearch &HS) { // Write the header search table RecordData::value_type Record[] = {HEADER_SEARCH_TABLE, BucketOffset, NumHeaderSearchEntries, TableData.size()}; - TableData.append(GeneratorTrait.strings_begin(),GeneratorTrait.strings_end()); Stream.EmitRecordWithBlob(TableAbbrev, Record, TableData); // Free all of the strings we had to duplicate. 
diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt index 62aa5ff7f002a..c6e5afdc42424 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -136,7 +136,7 @@ add_clang_library(clangStaticAnalyzerCheckers WebKit/RefCntblBaseVirtualDtorChecker.cpp WebKit/UncountedCallArgsChecker.cpp WebKit/UncountedLambdaCapturesChecker.cpp - WebKit/UncountedLocalVarsChecker.cpp + WebKit/RawPtrRefLocalVarsChecker.cpp LINK_LIBS clangAST diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 3e95db7e97fac..4166cf14391e2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1091,12 +1091,15 @@ static bool isStandardDelete(const FunctionDecl *FD) { if (Kind != OO_Delete && Kind != OO_Array_Delete) return false; + bool HasBody = FD->hasBody(); // Prefer using the definition. + // This is standard if and only if it's not defined in a user file. SourceLocation L = FD->getLocation(); + // If the header for operator delete is not included, it's still defined // in an invalid source location. Check to make sure we don't crash. 
- return !L.isValid() || - FD->getASTContext().getSourceManager().isInSystemHeader(L); + const auto &SM = FD->getASTContext().getSourceManager(); + return L.isInvalid() || (!HasBody && SM.isInSystemHeader(L)); } //===----------------------------------------------------------------------===// diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 2293dcf1d4bd6..46819d5ca1205 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -200,6 +200,14 @@ std::optional isUncountedPtr(const QualType T) { return false; } +std::optional isUncheckedPtr(const QualType T) { + if (T->isPointerType() || T->isReferenceType()) { + if (auto *CXXRD = T->getPointeeCXXRecordDecl()) + return isUnchecked(CXXRD); + } + return false; +} + std::optional isUnsafePtr(const QualType T) { if (T->isPointerType() || T->isReferenceType()) { if (auto *CXXRD = T->getPointeeCXXRecordDecl()) { diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 4b41ca96e1df1..30bdaed706bb5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -63,6 +63,10 @@ std::optional isUncounted(const clang::CXXRecordDecl* Class); /// class, false if not, std::nullopt if inconclusive. std::optional isUncountedPtr(const clang::QualType T); +/// \returns true if \p T is either a raw pointer or reference to an unchecked +/// class, false if not, std::nullopt if inconclusive. +std::optional isUncheckedPtr(const clang::QualType T); + /// \returns true if \p T is a RefPtr, Ref, CheckedPtr, CheckedRef, or its /// variant, false if not. 
bool isSafePtrType(const clang::QualType T); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp similarity index 87% rename from clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp rename to clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp index 76a4599cc8d78..06f8f43cee815 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp @@ -165,15 +165,18 @@ bool isGuardedScopeEmbeddedInGuardianScope(const VarDecl *Guarded, return false; } -class UncountedLocalVarsChecker +class RawPtrRefLocalVarsChecker : public Checker> { - BugType Bug{this, - "Uncounted raw pointer or reference not provably backed by " - "ref-counted variable", - "WebKit coding guidelines"}; + BugType Bug; mutable BugReporter *BR; public: + RawPtrRefLocalVarsChecker(const char *description) + : Bug(this, description, "WebKit coding guidelines") {} + + virtual std::optional isUnsafePtr(const QualType T) const = 0; + virtual const char *ptrKind() const = 0; + void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, BugReporter &BRArg) const { BR = &BRArg; @@ -182,14 +185,14 @@ class UncountedLocalVarsChecker // visit template instantiations or lambda classes. We // want to visit those, so we make our own RecursiveASTVisitor. 
struct LocalVisitor : public RecursiveASTVisitor { - const UncountedLocalVarsChecker *Checker; + const RawPtrRefLocalVarsChecker *Checker; Decl *DeclWithIssue{nullptr}; TrivialFunctionAnalysis TFA; using Base = RecursiveASTVisitor; - explicit LocalVisitor(const UncountedLocalVarsChecker *Checker) + explicit LocalVisitor(const RawPtrRefLocalVarsChecker *Checker) : Checker(Checker) { assert(Checker); } @@ -261,7 +264,7 @@ class UncountedLocalVarsChecker if (shouldSkipVarDecl(V)) return; - std::optional IsUncountedPtr = isUncountedPtr(V->getType()); + std::optional IsUncountedPtr = isUnsafePtr(V->getType()); if (IsUncountedPtr && *IsUncountedPtr) { if (tryToFindPtrOrigin( Value, /*StopAtFirstRefCountedObj=*/false, @@ -324,7 +327,7 @@ class UncountedLocalVarsChecker llvm::raw_svector_ostream Os(Buf); if (dyn_cast(V)) { - Os << "Assignment to an uncounted parameter "; + Os << "Assignment to an " << ptrKind() << " parameter "; printQuotedQualifiedName(Os, V); Os << " is unsafe."; @@ -342,7 +345,7 @@ class UncountedLocalVarsChecker else Os << "Variable "; printQuotedQualifiedName(Os, V); - Os << " is uncounted and unsafe."; + Os << " is " << ptrKind() << " and unsafe."; PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager()); auto Report = std::make_unique(Bug, Os.str(), BSLoc); @@ -352,6 +355,29 @@ class UncountedLocalVarsChecker } } }; + +class UncountedLocalVarsChecker final : public RawPtrRefLocalVarsChecker { +public: + UncountedLocalVarsChecker() + : RawPtrRefLocalVarsChecker("Uncounted raw pointer or reference not " + "provably backed by ref-counted variable") {} + std::optional isUnsafePtr(const QualType T) const final { + return isUncountedPtr(T); + } + const char *ptrKind() const final { return "uncounted"; } +}; + +class UncheckedLocalVarsChecker final : public RawPtrRefLocalVarsChecker { +public: + UncheckedLocalVarsChecker() + : RawPtrRefLocalVarsChecker("Unchecked raw pointer or reference not " + "provably backed by checked variable") {} + 
std::optional isUnsafePtr(const QualType T) const final { + return isUncheckedPtr(T); + } + const char *ptrKind() const final { return "unchecked"; } +}; + } // namespace void ento::registerUncountedLocalVarsChecker(CheckerManager &Mgr) { @@ -361,3 +387,11 @@ void ento::registerUncountedLocalVarsChecker(CheckerManager &Mgr) { bool ento::shouldRegisterUncountedLocalVarsChecker(const CheckerManager &) { return true; } + +void ento::registerUncheckedLocalVarsChecker(CheckerManager &Mgr) { + Mgr.registerChecker(); +} + +bool ento::shouldRegisterUncheckedLocalVarsChecker(const CheckerManager &) { + return true; +} diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp index 45e48d435aca6..229169f848e22 100644 --- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp @@ -860,11 +860,12 @@ SVal SimpleSValBuilder::evalBinOpLL(ProgramStateRef state, // If one of the operands is a symbol and the other is a constant, // build an expression for use by the constraint manager. if (SymbolRef rSym = rhs.getAsLocSymbol()) { - // We can only build expressions with symbols on the left, - // so we need a reversible operator. 
- if (!BinaryOperator::isComparisonOp(op) || op == BO_Cmp) + if (op == BO_Cmp) return UnknownVal(); + if (!BinaryOperator::isComparisonOp(op)) + return makeNonLoc(L.getValue(), op, rSym, resultTy); + op = BinaryOperator::reverseComparisonOp(op); return makeNonLoc(rSym, op, L.getValue(), resultTy); } diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp new file mode 100644 index 0000000000000..230680ff3ced7 --- /dev/null +++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp @@ -0,0 +1,87 @@ +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple x86_64-apple-macosx10.14.0 %s +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple x86_64-apple-macosx10.14.0 %s -fno-signed-char +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple aarch64_be-linux-gnu %s + +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -triple x86_64-apple-macosx10.14.0 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -triple x86_64-apple-macosx10.14.0 %s -fno-signed-char -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -triple aarch64_be-linux-gnu %s -fexperimental-new-constant-interpreter + +// both-no-diagnostics + +typedef decltype(nullptr) nullptr_t; +typedef __INTPTR_TYPE__ intptr_t; + +static_assert(sizeof(int) == 4); +static_assert(sizeof(long long) == 8); + +template +constexpr To bit_cast(const From &from) { + static_assert(sizeof(To) == sizeof(From)); + return __builtin_bit_cast(To, from); +} + +template +constexpr bool check_round_trip(const Init &init) { + return bit_cast(bit_cast(init)) == init; +} + +template +constexpr Init round_trip(const Init &init) { + return bit_cast(bit_cast(init)); +} + + + + +namespace test_long_double { +#if __x86_64 +#if 0 +constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long 
double)0); // expected-error{{must be initialized by a constant expression}}\ + // expected-note{{in call}} +#endif +constexpr long double ld = 3.1425926539; + +struct bytes { + unsigned char d[16]; +}; + +// static_assert(round_trip(ld), ""); + +static_assert(round_trip(10.0L)); + +#if 0 +constexpr bool f(bool read_uninit) { + bytes b = bit_cast(ld); + unsigned char ld_bytes[10] = { + 0x0, 0x48, 0x9f, 0x49, 0xf0, + 0x3c, 0x20, 0xc9, 0x0, 0x40, + }; + + for (int i = 0; i != 10; ++i) + if (ld_bytes[i] != b.d[i]) + return false; + + if (read_uninit && b.d[10]) // expected-note{{read of uninitialized object is not allowed in a constant expression}} + return false; + + return true; +} + +static_assert(f(/*read_uninit=*/false), ""); +static_assert(f(/*read_uninit=*/true), ""); // expected-error{{static assertion expression is not an integral constant expression}} \ + // expected-note{{in call to 'f(true)'}} +#endif +constexpr bytes ld539 = { + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xc0, 0x86, + 0x8, 0x40, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, +}; + +constexpr long double fivehundredandthirtynine = 539.0; + +static_assert(bit_cast(ld539) == fivehundredandthirtynine, ""); +#else +static_assert(round_trip<__int128_t>(34.0L)); +#endif +} diff --git a/clang/test/AST/ByteCode/builtin-bit-cast.cpp b/clang/test/AST/ByteCode/builtin-bit-cast.cpp new file mode 100644 index 0000000000000..a48d9549959e9 --- /dev/null +++ b/clang/test/AST/ByteCode/builtin-bit-cast.cpp @@ -0,0 +1,458 @@ +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only %s +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple aarch64_be-linux-gnu %s +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple powerpc64le-unknown-unknown -mabi=ieeelongdouble %s +// RUN: %clang_cc1 -verify=ref,both -std=c++2a -fsyntax-only -triple powerpc64-unknown-unknown -mabi=ieeelongdouble %s + +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -fexperimental-new-constant-interpreter %s +// 
RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -triple aarch64_be-linux-gnu -fexperimental-new-constant-interpreter %s +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -fexperimental-new-constant-interpreter -triple powerpc64le-unknown-unknown -mabi=ieeelongdouble %s +// RUN: %clang_cc1 -verify=expected,both -std=c++2a -fsyntax-only -fexperimental-new-constant-interpreter -triple powerpc64-unknown-unknown -mabi=ieeelongdouble %s + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define LITTLE_END 1 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define LITTLE_END 0 +#else +# error "huh?" +#endif + +typedef decltype(nullptr) nullptr_t; +typedef __INTPTR_TYPE__ intptr_t; + +static_assert(sizeof(int) == 4); +static_assert(sizeof(long long) == 8); + +template +constexpr To bit_cast(const From &from) { + static_assert(sizeof(To) == sizeof(From)); + return __builtin_bit_cast(To, from); +} + +template +constexpr bool check_round_trip(const Init &init) { + return bit_cast(bit_cast(init)) == init; +} + +template +constexpr Init round_trip(const Init &init) { + return bit_cast(bit_cast(init)); +} + +namespace std { +enum byte : unsigned char {}; +} // namespace std + +using uint8_t = unsigned char; + +template +struct bytes { + using size_t = unsigned int; + unsigned char d[N]; + + constexpr unsigned char &operator[](size_t index) { + if (index < N) + return d[index]; + } +}; + + +template +struct bits { + T : Pad; + T bits : N; + + constexpr bool operator==(const T& rhs) const { + return bits == rhs; + } +}; + +template +constexpr bool operator==(const struct bits& lhs, const struct bits& rhs) { + return lhs.bits == rhs.bits; +} + +#ifdef __SIZEOF_INT128__ +static_assert(check_round_trip<__int128_t>((__int128_t)34)); +static_assert(check_round_trip<__int128_t>((__int128_t)-34)); +#endif + +static_assert(check_round_trip(17.0)); + + +namespace simple { + constexpr int A = __builtin_bit_cast(int, 10); + static_assert(A == 10); + + 
static_assert(__builtin_bit_cast(unsigned, 1.0F) == 1065353216); + + struct Bytes { + char a, b, c, d; + }; + constexpr unsigned B = __builtin_bit_cast(unsigned, Bytes{10, 12, 13, 14}); + static_assert(B == (LITTLE_END ? 235736074 : 168561934)); + + + constexpr unsigned C = __builtin_bit_cast(unsigned, (_BitInt(32))12); + static_assert(C == 12); + + struct BitInts { + _BitInt(16) a; + _BitInt(16) b; + }; + constexpr unsigned D = __builtin_bit_cast(unsigned, BitInts{12, 13}); + static_assert(D == (LITTLE_END ? 851980 : 786445)); + + + + static_assert(__builtin_bit_cast(char, true) == 1); + + static_assert(check_round_trip((int)-1)); + static_assert(check_round_trip((int)0x12345678)); + static_assert(check_round_trip((int)0x87654321)); + static_assert(check_round_trip((int)0x0C05FEFE)); + static_assert(round_trip((int)0x0C05FEFE)); + + + /// This works in GCC and in the bytecode interpreter, but the current interpreter + /// diagnoses it. + static_assert(__builtin_bit_cast(intptr_t, nullptr) == 0); // ref-error {{not an integral constant expression}} \ + // ref-note {{indeterminate value can only initialize an object}} +} + +namespace Fail { + constexpr int a = 1/0; // both-error {{must be initialized by a constant expression}} \ + // both-note {{division by zero}} \ + // both-note {{declared here}} + constexpr int b = __builtin_bit_cast(int, a); // both-error {{must be initialized by a constant expression}} \ + // both-note {{initializer of 'a' is not a constant expression}} +} + +namespace NullPtr { + constexpr nullptr_t N = __builtin_bit_cast(nullptr_t, (intptr_t)1u); + static_assert(N == nullptr); + static_assert(__builtin_bit_cast(nullptr_t, (_BitInt(sizeof(void*) * 8))12) == __builtin_bit_cast(nullptr_t, (unsigned _BitInt(sizeof(void*) * 8))0)); + static_assert(__builtin_bit_cast(nullptr_t, nullptr) == nullptr); +} + +namespace bitint { + constexpr _BitInt(sizeof(int) * 8) BI = ~0; + constexpr unsigned int I = __builtin_bit_cast(unsigned int, BI); + 
static_assert(I == ~0u, ""); + + constexpr _BitInt(sizeof(int) * 8) IB = __builtin_bit_cast(_BitInt(sizeof(int) * 8), I); // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{constexpr bit cast involving type '_BitInt(32)' is not yet supported}} \ + // ref-note {{declared here}} + static_assert(IB == ~0u, ""); // ref-error {{not an integral constant expression}} \ + // ref-note {{initializer of 'IB' is not a constant expression}} +} + +namespace BitFields { + struct BitFields { + unsigned a : 2; + unsigned b : 30; + }; + + constexpr unsigned A = __builtin_bit_cast(unsigned, BitFields{3, 16}); // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{not yet supported}} \ + // ref-note {{declared here}} + static_assert(A == (LITTLE_END ? 67 : 3221225488)); // ref-error {{not an integral constant expression}} \ + // ref-note {{initializer of 'A'}} + + + void bitfield_indeterminate() { + struct BF { unsigned char z : 2; }; + enum byte : unsigned char {}; + + constexpr BF bf = {0x3}; + /// Requires bitcasts to composite types. + // static_assert(bit_cast>(bf).bits == bf.z); + // static_assert(bit_cast(bf)); + +#if 0 + // static_assert(__builtin_bit_cast(byte, bf)); + + struct M { + // expected-note@+1 {{subobject declared here}} + unsigned char mem[sizeof(BF)]; + }; + // expected-error@+2 {{initialized by a constant expression}} + // expected-note@+1 {{not initialized}} + constexpr M m = bit_cast(bf); + + constexpr auto f = []() constexpr { + // bits<24, unsigned int, LITTLE_END ? 
0 : 8> B = {0xc0ffee}; + constexpr struct { unsigned short b1; unsigned char b0; } B = {0xc0ff, 0xee}; + return bit_cast>(B); + }; + + static_assert(f()[0] + f()[1] + f()[2] == 0xc0 + 0xff + 0xee); + { + // expected-error@+2 {{initialized by a constant expression}} + // expected-note@+1 {{read of uninitialized object is not allowed in a constant expression}} + constexpr auto _bad = f()[3]; + } + + struct B { + unsigned short s0 : 8; + unsigned short s1 : 8; + std::byte b0 : 4; + std::byte b1 : 4; + std::byte b2 : 4; + }; + constexpr auto g = [f]() constexpr { + return bit_cast(f()); + }; + static_assert(g().s0 + g().s1 + g().b0 + g().b1 == 0xc0 + 0xff + 0xe + 0xe); + { + // expected-error@+2 {{initialized by a constant expression}} + // expected-note@+1 {{read of uninitialized object is not allowed in a constant expression}} + constexpr auto _bad = g().b2; + } +#endif + } +} + +namespace Classes { + class A { + public: + char a[2]; + }; + class B : public A { + public: + char b[2]; + }; + static_assert(__builtin_bit_cast(int, B{{0, 0},{0, 0}}) == 0); + static_assert(__builtin_bit_cast(int, B{{13, 0},{0, 0}}) == (LITTLE_END ? 13 : 218103808)); + static_assert(__builtin_bit_cast(int, B{{13, 7},{12, 20}}) == (LITTLE_END ? 
336332557 : 218565652)); + + class Ref { + public: + const int &a; + constexpr Ref(const int &a) : a(a) {} + }; + constexpr int I = 12; + + typedef __INTPTR_TYPE__ intptr_t; + static_assert(__builtin_bit_cast(intptr_t, Ref{I}) == 0); // both-error {{not an integral constant expression}} \ + // both-note {{bit_cast from a type with a reference member is not allowed in a constant expression}} + + class C : public A { + public: + constexpr C() : A{1,2} {} + virtual constexpr int get() { + return 4; + } + }; + static_assert(__builtin_bit_cast(_BitInt(sizeof(C) * 8), C()) == 0); // both-error {{source type must be trivially copyable}} + + + class D : virtual A {}; + static_assert(__builtin_bit_cast(_BitInt(sizeof(D) * 8), D()) == 0); // both-error {{source type must be trivially copyable}} + + class F { + public: + char f[2]; + }; + + class E : public A, public F { + public: + constexpr E() : A{1,2}, F{3,4}, e{5,6,7,8} {} + char e[4]; + }; + static_assert(__builtin_bit_cast(long long, E()) == (LITTLE_END ? 578437695752307201 : 72623859790382856)); +} + +struct int_splicer { + unsigned x; + unsigned y; + + constexpr int_splicer() : x(1), y(2) {} + constexpr int_splicer(unsigned x, unsigned y) : x(x), y(y) {} + + constexpr bool operator==(const int_splicer &other) const { + return other.x == x && other.y == y; + } +}; + +constexpr int_splicer splice(0x0C05FEFE, 0xCAFEBABE); + +#if 0 +static_assert(bit_cast(splice) == (LITTLE_END + ? 0xCAFEBABE0C05FEFE + : 0x0C05FEFECAFEBABE)); + +constexpr int_splicer IS = bit_cast(0xCAFEBABE0C05FEFE); +static_assert(bit_cast(0xCAFEBABE0C05FEFE).x == (LITTLE_END + ? 
0x0C05FEFE + : 0xCAFEBABE)); + +static_assert(round_trip(splice)); +static_assert(round_trip(splice)); +#endif + + + +/// --------------------------------------------------------------------------- +/// From here on, it's things copied from test/SemaCXX/constexpr-builtin-bit.cast.cpp + +void test_int() { + static_assert(round_trip((int)-1)); + static_assert(round_trip((int)0x12345678)); + static_assert(round_trip((int)0x87654321)); + static_assert(round_trip((int)0x0C05FEFE)); +} + +void test_array() { + constexpr unsigned char input[] = {0xCA, 0xFE, 0xBA, 0xBE}; + constexpr unsigned expected = LITTLE_END ? 0xBEBAFECA : 0xCAFEBABE; + static_assert(bit_cast(input) == expected); + + /// Same things but with a composite array. + struct US { unsigned char I; }; + constexpr US input2[] = {{0xCA}, {0xFE}, {0xBA}, {0xBE}}; + static_assert(bit_cast(input2) == expected); +} + +void test_record() { + struct int_splicer { + unsigned x; + unsigned y; + + constexpr bool operator==(const int_splicer &other) const { + return other.x == x && other.y == y; + } + }; + + constexpr int_splicer splice{0x0C05FEFE, 0xCAFEBABE}; + + static_assert(bit_cast(splice) == (LITTLE_END + ? 0xCAFEBABE0C05FEFE + : 0x0C05FEFECAFEBABE)); + + /// FIXME: Bit casts to composite types. + // static_assert(bit_cast(0xCAFEBABE0C05FEFE).x == (LITTLE_END + // ? 
0x0C05FEFE + // : 0xCAFEBABE)); + + // static_assert(check_round_trip(splice)); + // static_assert(check_round_trip(splice)); + + struct base2 { + }; + + struct base3 { + unsigned z; + }; + + struct bases : int_splicer, base2, base3 { + unsigned doublez; + }; + + struct tuple4 { + unsigned x, y, z, doublez; + + bool operator==(tuple4 const &other) const = default; + constexpr bool operator==(bases const &other) const { + return x == other.x && y == other.y && + z == other.z && doublez == other.doublez; + } + }; + // constexpr bases b = {{1, 2}, {}, {3}, 4}; + // constexpr tuple4 t4 = bit_cast(b); + // static_assert(t4 == tuple4{1, 2, 3, 4}); + // static_assert(round_trip(b)); + + // constexpr auto b2 = bit_cast(t4); + // static_assert(t4 == b2); +} + +void test_partially_initialized() { + struct pad { + signed char x; + int y; + }; + + struct no_pad { + signed char x; + signed char p1, p2, p3; + int y; + }; + + static_assert(sizeof(pad) == sizeof(no_pad)); + +#if 0 + constexpr pad pir{4, 4}; + constexpr int piw = bit_cast(pir).x; // both-error {{constexpr variable 'piw' must be initialized by a constant expression}} \ + // both-note {{in call to 'bit_cast(pir)'}} + + + constexpr no_pad bad = bit_cast(pir); // both-error {{constexpr variable 'bad' must be initialized by a constant expression}} \ + // both-note {{in call to 'bit_cast(pir)'}} + // constexpr pad fine = bit_cast(no_pad{1, 2, 3, 4, 5}); + // static_assert(fine.x == 1 && fine.y == 5); +#endif +} + + +void bad_types() { + union X { + int x; + }; + static_assert(__builtin_bit_cast(int, X{0}) == 0); // both-error {{not an integral constant expression}} \ + // both-note {{bit_cast from a union type is not allowed in a constant expression}} +#if 0 + + struct G { + int g; + }; + // expected-error@+2 {{constexpr variable 'g' must be initialized by a constant expression}} + // expected-note@+1 {{bit_cast from a union type is not allowed in a constant expression}} + constexpr G g = __builtin_bit_cast(G, X{0}); + 
// expected-error@+2 {{constexpr variable 'x' must be initialized by a constant expression}} + // expected-note@+1 {{bit_cast to a union type is not allowed in a constant expression}} + constexpr X x = __builtin_bit_cast(X, G{0}); +#endif + struct has_pointer { + int *ptr; // both-note {{invalid type 'int *' is a member of 'has_pointer'}} + }; + + constexpr intptr_t ptr = __builtin_bit_cast(intptr_t, has_pointer{0}); // both-error {{constexpr variable 'ptr' must be initialized by a constant expression}} \ + // both-note {{bit_cast from a pointer type is not allowed in a constant expression}} + +#if 0 + // expected-error@+2 {{constexpr variable 'hptr' must be initialized by a constant expression}} + // expected-note@+1 {{bit_cast to a pointer type is not allowed in a constant expression}} + constexpr has_pointer hptr = __builtin_bit_cast(has_pointer, 0ul); +#endif +} + +void test_array_fill() { + constexpr unsigned char a[4] = {1, 2}; + constexpr unsigned int i = bit_cast(a); + static_assert(i == (LITTLE_END ? 
0x00000201 : 0x01020000)); +} + +struct vol_mem { + volatile int x; +}; + +// both-error@+2 {{constexpr variable 'run_vol_mem' must be initialized by a constant expression}} +// both-note@+1 {{non-literal type 'vol_mem' cannot be used in a constant expression}} +constexpr int run_vol_mem = __builtin_bit_cast(int, vol_mem{43}); + +struct mem_ptr { + int vol_mem::*x; // both-note{{invalid type 'int vol_mem::*' is a member of 'mem_ptr'}} +}; + +// both-error@+2 {{constexpr variable 'run_mem_ptr' must be initialized by a constant expression}} +// both-note@+1 {{bit_cast from a member pointer type is not allowed in a constant expression}} +constexpr _BitInt(sizeof(mem_ptr) * 8) run_mem_ptr = __builtin_bit_cast(_BitInt(sizeof(mem_ptr) * 8), mem_ptr{nullptr}); + +constexpr int global_int = 0; + +struct ref_mem { + const int &rm; +}; +// both-error@+2 {{constexpr variable 'run_ref_mem' must be initialized by a constant expression}} +// both-note@+1 {{bit_cast from a type with a reference member is not allowed in a constant expression}} +constexpr intptr_t run_ref_mem = __builtin_bit_cast(intptr_t, ref_mem{global_int}); diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 94fe2d4497df6..8466e9b88782f 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -274,6 +274,15 @@ namespace NowThrowNew { static_assert(erroneous_array_bound_nothrow2(-1) == 0);// expected-error {{not an integral constant expression}} static_assert(!erroneous_array_bound_nothrow2(1LL << 62));// expected-error {{not an integral constant expression}} + constexpr bool erroneous_array_bound(long long n) { + delete[] new int[n]; // both-note {{array bound -1 is negative}} both-note {{array bound 4611686018427387904 is too large}} + return true; + } + static_assert(erroneous_array_bound(3)); + static_assert(erroneous_array_bound(0)); + static_assert(erroneous_array_bound(-1)); // both-error {{constant expression}} 
both-note {{in call}} + static_assert(erroneous_array_bound(1LL << 62)); // both-error {{constant expression}} both-note {{in call}} + constexpr bool evaluate_nothrow_arg() { bool ok = false; delete new ((ok = true, std::nothrow)) int; @@ -569,6 +578,16 @@ namespace CastedDelete { return a; } static_assert(vdtor_1() == 1); + + constexpr int foo() { // both-error {{never produces a constant expression}} + struct S {}; + struct T : S {}; + S *p = new T(); + delete p; // both-note 2{{delete of object with dynamic type 'T' through pointer to base class type 'S' with non-virtual destructor}} + return 1; + } + static_assert(foo() == 1); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} } constexpr void use_after_free_2() { // both-error {{never produces a constant expression}} diff --git a/clang/test/AST/attr-swift_attr.m b/clang/test/AST/attr-swift_attr.m index 6ea6775aa5a9a..6888745fe95d4 100644 --- a/clang/test/AST/attr-swift_attr.m +++ b/clang/test/AST/attr-swift_attr.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fblocks -ast-dump %s | FileCheck %s __attribute__((swift_attr("@actor"))) @interface View @@ -14,3 +14,48 @@ @interface Contact // CHECK-LABEL: InterfaceDecl {{.*}} Contact // CHECK-NEXT: SwiftAttrAttr {{.*}} "@sendable" + +#define SWIFT_SENDABLE __attribute__((swift_attr("@Sendable"))) + +@interface InTypeContext +- (nullable id)test:(nullable SWIFT_SENDABLE id)obj SWIFT_SENDABLE; +@end + +// CHECK-LABEL: InterfaceDecl {{.*}} InTypeContext +// CHECK-NEXT: MethodDecl {{.*}} - test: 'id _Nullable':'id' +// CHECK-NEXT: ParmVarDecl {{.*}} obj 'SWIFT_SENDABLE id _Nullable':'id' +// CHECK-NEXT: SwiftAttrAttr {{.*}} "@Sendable" + +@interface Generic +@end + +// CHECK-LABEL: InterfaceDecl {{.*}} Generic +// CHECK-NEXT: TypeParamDecl {{.*}} T bounded 'SWIFT_SENDABLE id':'id' + +typedef SWIFT_SENDABLE Generic Alias; + +// CHECK-LABEL: TypedefDecl {{.*}} Alias 'Generic' +// CHECK-NEXT: 
ObjectType {{.*}} 'Generic' +// CHECK-NEXT: SwiftAttrAttr {{.*}} "@Sendable" + +SWIFT_SENDABLE +typedef struct { + void *ptr; +} SendableStruct; + +// CHECK-LABEL: TypedefDecl {{.*}} SendableStruct 'struct SendableStruct':'SendableStruct' +// CHECK: SwiftAttrAttr {{.*}} "@Sendable" + +@interface TestAttrPlacementInBlock1 +-(void) withHandler: (void (SWIFT_SENDABLE ^)(id)) handler; +@end + +// CHECK-LABEL: ObjCInterfaceDecl {{.*}} TestAttrPlacementInBlock1 +// CHECK: handler 'SWIFT_SENDABLE void (^)(id)':'void (^)(id)' + +@interface TestAttrPlacementInBlock2 +-(void) withHandler: (void (^ SWIFT_SENDABLE)(id)) handler; +@end + +// CHECK-LABEL: ObjCInterfaceDecl {{.*}} TestAttrPlacementInBlock2 +// CHECK: handler 'SWIFT_SENDABLE void (^)(id)':'void (^)(id)' diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h index 82c79c97a83de..8d95926e419be 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h @@ -186,6 +186,8 @@ class CheckedObj { public: void incrementPtrCount(); void decrementPtrCount(); + void method(); + int trivial() { return 123; } }; class RefCountableAndCheckable { diff --git a/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp new file mode 100644 index 0000000000000..3bc75230fcf82 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unchecked-local-vars.cpp @@ -0,0 +1,342 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncheckedLocalVarsChecker -verify %s + +#include "mock-types.h" +#include "mock-system-header.h" + +void someFunction(); + +namespace raw_ptr { +void foo() { + CheckedObj *bar; + // FIXME: later on we might warn on uninitialized vars too +} + +void bar(CheckedObj *) {} +} // namespace raw_ptr + +namespace reference { +void foo_ref() { + CheckedObj automatic; + CheckedObj &bar = automatic; + // expected-warning@-1{{Local variable 
'bar' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + someFunction(); + bar.method(); +} + +void foo_ref_trivial() { + CheckedObj automatic; + CheckedObj &bar = automatic; +} + +void bar_ref(CheckedObj &) {} +} // namespace reference + +namespace guardian_scopes { +void foo1() { + CheckedPtr foo; + { CheckedObj *bar = foo.get(); } +} + +void foo2() { + CheckedPtr foo; + // missing embedded scope here + CheckedObj *bar = foo.get(); + // expected-warning@-1{{Local variable 'bar' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + someFunction(); + bar->method(); +} + +void foo3() { + CheckedPtr foo; + { + { CheckedObj *bar = foo.get(); } + } +} + +void foo4() { + { + CheckedPtr foo; + { CheckedObj *bar = foo.get(); } + } +} + +void foo5() { + CheckedPtr foo; + auto* bar = foo.get(); + bar->trivial(); +} + +void foo6() { + CheckedPtr foo; + auto* bar = foo.get(); + // expected-warning@-1{{Local variable 'bar' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + bar->method(); +} + +struct SelfReferencingStruct { + SelfReferencingStruct* ptr; + CheckedObj* obj { nullptr }; +}; + +void foo7(CheckedObj* obj) { + SelfReferencingStruct bar = { &bar, obj }; + bar.obj->method(); +} + +} // namespace guardian_scopes + +namespace auto_keyword { +class Foo { + CheckedObj *provide_ref_ctnbl(); + + void evil_func() { + CheckedObj *bar = provide_ref_ctnbl(); + // expected-warning@-1{{Local variable 'bar' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + auto *baz = provide_ref_ctnbl(); + // expected-warning@-1{{Local variable 'baz' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + auto *baz2 = this->provide_ref_ctnbl(); + // expected-warning@-1{{Local variable 'baz2' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + [[clang::suppress]] auto *baz_suppressed = provide_ref_ctnbl(); // no-warning + } + + void func() { + CheckedObj *bar = provide_ref_ctnbl(); + // 
expected-warning@-1{{Local variable 'bar' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + if (bar) + bar->method(); + } +}; +} // namespace auto_keyword + +namespace guardian_casts { +void foo1() { + CheckedPtr foo; + { + CheckedObj *bar = downcast(foo.get()); + bar->method(); + } + foo->method(); +} + +void foo2() { + CheckedPtr foo; + { + CheckedObj *bar = + static_cast(downcast(foo.get())); + someFunction(); + } +} +} // namespace guardian_casts + +namespace guardian_ref_conversion_operator { +void foo() { + CheckedRef rc; + { + CheckedObj &rr = rc; + rr.method(); + someFunction(); + } +} +} // namespace guardian_ref_conversion_operator + +namespace ignore_for_if { +CheckedObj *provide_ref_ctnbl() { return nullptr; } + +void foo() { + // no warnings + if (CheckedObj *a = provide_ref_ctnbl()) + a->trivial(); + for (CheckedObj *b = provide_ref_ctnbl(); b != nullptr;) + b->trivial(); + CheckedObj *array[1]; + for (CheckedObj *c : array) + c->trivial(); + while (CheckedObj *d = provide_ref_ctnbl()) + d->trivial(); + do { + CheckedObj *e = provide_ref_ctnbl(); + e->trivial(); + } while (1); + someFunction(); +} + +void bar() { + if (CheckedObj *a = provide_ref_ctnbl()) { + // expected-warning@-1{{Local variable 'a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + a->method(); + } + for (CheckedObj *b = provide_ref_ctnbl(); b != nullptr;) { + // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + b->method(); + } + CheckedObj *array[1]; + for (CheckedObj *c : array) { + // expected-warning@-1{{Local variable 'c' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + c->method(); + } + + while (CheckedObj *d = provide_ref_ctnbl()) { + // expected-warning@-1{{Local variable 'd' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + d->method(); + } + do { + CheckedObj *e = provide_ref_ctnbl(); + // expected-warning@-1{{Local variable 'e' is 
unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + e->method(); + } while (1); + someFunction(); +} + +} // namespace ignore_for_if + +namespace ignore_system_headers { + +CheckedObj *provide_checkable(); + +void system_header() { + localVar(provide_checkable); +} + +} // ignore_system_headers + +namespace conditional_op { +CheckedObj *provide_checkable(); +bool bar(); + +void foo() { + CheckedObj *a = bar() ? nullptr : provide_checkable(); + // expected-warning@-1{{Local variable 'a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + CheckedPtr b = provide_checkable(); + { + CheckedObj* c = bar() ? nullptr : b.get(); + c->method(); + CheckedObj* d = bar() ? b.get() : nullptr; + d->method(); + } +} + +} // namespace conditional_op + +namespace local_assignment_basic { + +CheckedObj *provide_checkable(); + +void foo(CheckedObj* a) { + CheckedObj* b = a; + // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + if (b->trivial()) + b = provide_checkable(); +} + +void bar(CheckedObj* a) { + CheckedObj* b; + // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + b = provide_checkable(); +} + +void baz() { + CheckedPtr a = provide_checkable(); + { + CheckedObj* b = a.get(); + // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + b = provide_checkable(); + } +} + +} // namespace local_assignment_basic + +namespace local_assignment_to_parameter { + +CheckedObj *provide_checkable(); +void someFunction(); + +void foo(CheckedObj* a) { + a = provide_checkable(); + // expected-warning@-1{{Assignment to an unchecked parameter 'a' is unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + someFunction(); + a->method(); +} + +} // namespace local_assignment_to_parameter + +namespace local_assignment_to_static_local { + +CheckedObj *provide_checkable(); +void someFunction(); + +void 
foo() { + static CheckedObj* a = nullptr; + // expected-warning@-1{{Static local variable 'a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + a = provide_checkable(); + someFunction(); + a->method(); +} + +} // namespace local_assignment_to_static_local + +namespace local_assignment_to_global { + +CheckedObj *provide_ref_cntbl(); +void someFunction(); + +CheckedObj* g_a = nullptr; +// expected-warning@-1{{Global variable 'local_assignment_to_global::g_a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + +void foo() { + g_a = provide_ref_cntbl(); + someFunction(); + g_a->method(); +} + +} // namespace local_assignment_to_global + +namespace local_refcountable_checkable_object { + +RefCountableAndCheckable* provide_obj(); + +void local_raw_ptr() { + RefCountableAndCheckable* a = nullptr; + // expected-warning@-1{{Local variable 'a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + a = provide_obj(); + a->method(); +} + +void local_checked_ptr() { + RefPtr a = nullptr; + a = provide_obj(); + a->method(); +} + +void local_var_with_guardian_checked_ptr() { + RefPtr a = provide_obj(); + { + auto* b = a.get(); + b->method(); + } +} + +void local_var_with_guardian_checked_ptr_with_assignment() { + RefPtr a = provide_obj(); + { + RefCountableAndCheckable* b = a.get(); + // expected-warning@-1{{Local variable 'b' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + b = provide_obj(); + b->method(); + } +} + +void local_var_with_guardian_checked_ref() { + Ref a = *provide_obj(); + { + RefCountableAndCheckable& b = a; + b.method(); + } +} + +void static_var() { + static RefCountableAndCheckable* a = nullptr; + // expected-warning@-1{{Static local variable 'a' is unchecked and unsafe [alpha.webkit.UncheckedLocalVarsChecker]}} + a = provide_obj(); +} + +} // namespace local_refcountable_checkable_object diff --git a/clang/test/Analysis/Inputs/overloaded-delete-in-header.h 
b/clang/test/Analysis/Inputs/overloaded-delete-in-header.h new file mode 100644 index 0000000000000..8243961d84830 --- /dev/null +++ b/clang/test/Analysis/Inputs/overloaded-delete-in-header.h @@ -0,0 +1,18 @@ +#ifndef OVERLOADED_DELETE_IN_HEADER +#define OVERLOADED_DELETE_IN_HEADER + +struct DeleteInHeader { + int data; + static void operator delete(void *ptr); +}; + +void DeleteInHeader::operator delete(void *ptr) { + DeleteInHeader *self = (DeleteInHeader *)ptr; + self->data = 1; // no-warning: Still alive. + + ::operator delete(ptr); + + self->data = 2; // expected-warning {{Use of memory after it is freed [cplusplus.NewDelete]}} +} + +#endif // OVERLOADED_DELETE_IN_SYSTEM_HEADER diff --git a/clang/test/Analysis/overloaded-delete-in-system-header.cpp b/clang/test/Analysis/overloaded-delete-in-system-header.cpp new file mode 100644 index 0000000000000..c284a94206306 --- /dev/null +++ b/clang/test/Analysis/overloaded-delete-in-system-header.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_analyze_cc1 -isystem %S/Inputs/ -verify %s \ +// RUN: -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete + +// RUN: %clang_analyze_cc1 -I %S/Inputs/ -verify %s \ +// RUN: -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete + +#include "overloaded-delete-in-header.h" + +void deleteInHeader(DeleteInHeader *p) { delete p; } diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 98829d53db934..5369dc92f69e8 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -80,6 +80,7 @@ list(APPEND CLANG_TEST_DEPS clang-nvlink-wrapper clang-offload-bundler clang-offload-packager + clang-sycl-linker diagtool hmaptool ) diff --git a/clang/test/CXX/temp/temp.decls/temp.spec.partial/temp.spec.partial.member/p2.cpp b/clang/test/CXX/temp/temp.decls/temp.spec.partial/temp.spec.partial.member/p2.cpp new file mode 100644 index 0000000000000..7969b7efe597f --- /dev/null +++ b/clang/test/CXX/temp/temp.decls/temp.spec.partial/temp.spec.partial.member/p2.cpp @@ -0,0 +1,85 @@ 
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify %s +// expected-no-diagnostics + +template +struct A { + template + struct B { + static constexpr int y = 0; + }; + + template + struct B { + static constexpr int y = 1; + }; + + template + static constexpr int x = 0; + + template + static constexpr int x = 1; +}; + +template +template +struct A::B { + static constexpr int y = 2; +}; + +template +template +constexpr int A::x = 2; + +static_assert(A::B::y == 0); +static_assert(A::B::y == 1); +static_assert(A::B::y == 2); +static_assert(A::x == 0); +static_assert(A::x == 1); +static_assert(A::x == 2); + +template<> +template +struct A::B { + static constexpr int y = 3; +}; + +template<> +template +struct A::B { + static constexpr int y = 4; +}; + +template<> +template +struct A::B { + static constexpr int y = 5; +}; + +template<> +template +constexpr int A::x = 3; + +template<> +template +constexpr int A::x = 4; + +template<> +template +constexpr int A::x = 5; + +static_assert(A::B::y == 3); +static_assert(A::B::y == 3); +static_assert(A::B::y == 3); +static_assert(A::B::y == 4); +static_assert(A::x == 3); +static_assert(A::x == 3); +static_assert(A::x == 3); +static_assert(A::x == 4); +static_assert(A::B::y == 0); +static_assert(A::B::y == 1); +static_assert(A::B::y == 2); +static_assert(A::B::y == 5); +static_assert(A::x == 0); +static_assert(A::x == 1); +static_assert(A::x == 2); +static_assert(A::x == 5); diff --git a/clang/test/CodeGen/2004-02-20-Builtins.c b/clang/test/CodeGen/2004-02-20-Builtins.c index 13f970127d606..4febe2fd30e1d 100644 --- a/clang/test/CodeGen/2004-02-20-Builtins.c +++ b/clang/test/CodeGen/2004-02-20-Builtins.c @@ -3,6 +3,9 @@ double sqrt(double x); // CHECK-LABEL: @zsqrtxxx // CHECK-NOT: builtin +// Don't search into metadata definitions. !llvm.ident can contain the +// substring "builtin" if it's in the source tree path. 
+// CHECK-LABEL: !llvm.ident void zsqrtxxx(float num) { num = sqrt(num); } diff --git a/clang/test/CodeGen/X86/amx_fp8.c b/clang/test/CodeGen/X86/amx_fp8.c new file mode 100644 index 0000000000000..9c79514f89129 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_fp8.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-fp8 \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s +#include + +void test_amx(void *data) { + //CHECK-LABEL: @test_amx + //CHECK: call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3) + _tile_dpbf8ps(1, 2, 3); +} + +void test_amx2(void *data) { + //CHECK-LABEL: @test_amx2 + //CHECK: call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3) + _tile_dpbhf8ps(1, 2, 3); +} + +void test_amx3(void *data) { + //CHECK-LABEL: @test_amx3 + //CHECK: call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3) + _tile_dphbf8ps(1, 2, 3); +} + +void test_amx4(void *data) { + //CHECK-LABEL: @test_amx4 + //CHECK: call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3) + _tile_dphf8ps(1, 2, 3); +} diff --git a/clang/test/CodeGen/X86/amx_fp8_errors.c b/clang/test/CodeGen/X86/amx_fp8_errors.c new file mode 100644 index 0000000000000..77cbd34905b8b --- /dev/null +++ b/clang/test/CodeGen/X86/amx_fp8_errors.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-fp8 -verify + +#include + +void test_amx(void *data) { + _tile_dpbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dpbhf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dphbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dphf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} +} diff --git a/clang/test/CodeGen/X86/amx_fp8_inline_asm.c b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c new file mode 100644 index 0000000000000..49331bd9d368a --- /dev/null +++ 
b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-fp8 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s + +void f_tilemul(short a) +{ + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdpbf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbhf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdpbhf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdphbf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdphbf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdphf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile 
("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdphf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); +} diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c new file mode 100644 index 0000000000000..deefc592c7ae6 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_transpose.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \ +// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s + +#include +#include + +void test_tile_2rpntlvwz0(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0 + // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0(1, A, B); +} + +void test_tile_2rpntlvwz0t1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz0t1 + // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz0t1(1, A, B); +} + +void test_tile_2rpntlvwz1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1 + // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1(1, A, B); +} + +void test_tile_2rpntlvwz1t1(const void *A, size_t B) { + // CHECK-LABEL: @test_tile_2rpntlvwz1t1 + // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}}) + _tile_2rpntlvwz1t1(1, A, B); +} + +void test_tile_transposed(void) +{ + // CHECK-LABEL: @test_tile_transposed + // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2) + _tile_transposed(1, 2); +} diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c new file mode 100644 index 0000000000000..10310c2332b7a --- /dev/null +++ b/clang/test/CodeGen/X86/amx_transpose_api.c @@ -0,0 +1,66 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding 
-triple=x86_64-unknown-unknown -target-feature +avx512f \ +// RUN: -target-feature +amx-transpose \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include + +char buf[2048]; +#define STRIDE 32 + +char buf2[2048]; + +void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz0 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz0t1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr 
{{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) { + //CHECK-LABEL: @test_tile_2rpntlvwz1t1 + //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1 + //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}} + __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE); +} + +void test_tile_transposed(__tile1024i dst, __tile1024i src) { + //CHECK-LABEL: @test_tile_transposed + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_transposed(&dst, src); +} diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c new file mode 100644 index 0000000000000..80084c42a240d --- /dev/null +++ b/clang/test/CodeGen/X86/amx_transpose_errors.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \ +// RUN: -target-feature +avx512f -target-feature +amx-element-evex -verify + +#include +#include +#include +#include + +// Transpose +void test_tile_2rpntlvwz0(const void *A, size_t B) { + _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 
8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz0t1(const void *A, size_t B) { + _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1(const void *A, size_t B) { + _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_2rpntlvwz1t1(const void *A, size_t B) { + _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_tile_transposed() +{ + _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} diff --git a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c index 8d9ca671f30f8..f7ecf12d0becf 100644 --- a/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c +++ b/clang/test/CodeGen/X86/cmpccxadd-builtins-error.c @@ -10,3 +10,7 @@ int test_cmpccxadd32(void *__A, int __B, int __C) { long long test_cmpccxadd64(void *__A, long long __B, long long __C) { return _cmpccxadd_epi64(__A, __B, __C, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} } + +long long test_cmpccxadd64_2(int *__A, long long __B, long long __C) { + return _cmpccxadd_epi64(__A, __B, __C, 3); // expected-warning {{incompatible pointer types passing 'int *' to parameter of type 'long long *'}} +} diff --git a/clang/test/CodeGen/fat-lto-objects-cfi.cpp b/clang/test/CodeGen/fat-lto-objects-cfi.cpp new file mode 100644 index 0000000000000..628951847053a --- /dev/null +++ b/clang/test/CodeGen/fat-lto-objects-cfi.cpp @@ -0,0 +1,46 @@ +// REQUIRES: x86-registered-target + +// RUN: %clang_cc1 -triple x86_64-unknown-fuchsia -O2 -flto -ffat-lto-objects \ +// RUN: -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -fvisibility=hidden -emit-llvm -o - %s \ +// RUN: | FileCheck %s + 
+// CHECK: llvm.embedded.object +// CHECK-SAME: section ".llvm.lto" + +// CHECK-LABEL: define hidden void @foo +// CHECK: entry: +// CHECK-NEXT: %cmp14.not = icmp eq i64 %len, 0 +// CHECK-NEXT: br i1 %cmp14.not, label %for.end7, label %for.cond1.preheader.preheader +// CHECK: for.cond1.preheader.preheader: ; preds = %entry +// CHECK-NEXT: %arrayidx.1 = getelementptr inbounds nuw i8, ptr %ptr, i64 4 +// CHECK-NEXT: br label %for.cond1.preheader + +// CHECK-NOT: @llvm.type.test + +// The code below is a reduced case from https://github.com/llvm/llvm-project/issues/112053 +#define __PRINTFLIKE(__fmt, __varargs) __attribute__((__format__(__printf__, __fmt, __varargs))) +typedef void func(void* arg, const char* fmt, ...) __PRINTFLIKE(2, 3); +typedef __SIZE_TYPE__ size_t; +typedef unsigned long uintptr_t; + +extern "C" +void foo(const void* ptr, size_t len, long disp_addr, + func* printf_func, void* printf_arg) { + uintptr_t address = (uintptr_t)ptr; + size_t count; + + for (count = 0; count < len; count += 16) { + union { + unsigned int buf[4]; + unsigned char cbuf[16]; + } u; + size_t s = 10; + size_t i; + + for (i = 0; i < s / 4; i++) { + u.buf[i] = ((const unsigned int*)address)[i]; + printf_func(printf_arg, "%08x ", static_cast(u.buf[i])); + } + } +} + diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl deleted file mode 100644 index e1e047485e4df..0000000000000 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s - -RWBuffer Buffer1; -RWBuffer > BufferArray[4]; - -RWBuffer Buffer2 : register(u3); -RWBuffer > BufferArray2[4] : register(u4); - -RWBuffer Buffer3 : register(u3, space1); -RWBuffer > BufferArray3[4] : register(u4, space1); - - - -[numthreads(1,1,1)] -void main() { -} - -// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], 
![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index fa81b53fd9bdd..16120a44a9e4d 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -54,17 +54,3 @@ void main(int GI : SV_GroupIndex) { BufF16x2[GI] = 0; BufF32x3[GI] = 0; } - -// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl index 727f416cde57f..71b5b7a75fa43 100644 --- 
a/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWStructuredBuffer-elementtype.hlsl @@ -54,17 +54,3 @@ void main(int GI : SV_GroupIndex) { BufF16x2[GI] = 0; BufF32x3[GI] = 0; } - -// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl deleted file mode 100644 index 5155f12902597..0000000000000 --- a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s - -RasterizerOrderedBuffer Buffer1; -RasterizerOrderedBuffer > BufferArray[4]; - -RasterizerOrderedBuffer Buffer2 : register(u3); -RasterizerOrderedBuffer > BufferArray2[4] : register(u4); - -RasterizerOrderedBuffer Buffer3 : register(u3, space1); -RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1); - -void main() {} - -// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 true, i32 -1, 
i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 true, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 true, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 true, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 true, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 true, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl deleted file mode 100644 index a88ea774f3320..0000000000000 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl +++ /dev/null @@ -1,22 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s - -StructuredBuffer Buffer1; -StructuredBuffer > BufferArray[4]; - -StructuredBuffer Buffer2 : register(t3); -StructuredBuffer > BufferArray2[4] : register(t4); - -StructuredBuffer Buffer3 : register(t3, space1); -StructuredBuffer > BufferArray3[4] : register(t4, space1); - -[numthreads(1,1,1)] -void main() { -} - -// CHECK: !hlsl.srvs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git 
a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl index 4c30119498ff1..205e13b4de394 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -54,17 +54,3 @@ void main(int GI : SV_GroupIndex) { half2 v12 = BufF16x2[GI]; float3 v13 = BufF32x3[GI]; } - -// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/cbuf.hlsl b/clang/test/CodeGenHLSL/cbuf.hlsl index 78d9768b22fc8..3f9d4514967dd 100644 --- a/clang/test/CodeGenHLSL/cbuf.hlsl +++ b/clang/test/CodeGenHLSL/cbuf.hlsl @@ -23,6 +23,4 @@ float foo() { } // CHECK: !hlsl.cbufs = !{![[CBMD:[0-9]+]]} -// CHECK: !hlsl.srvs = !{![[TBMD:[0-9]+]]} // CHECK: ![[CBMD]] = !{ptr @[[CB]], i32 13, i32 0, i1 false, i32 0, i32 2} -// CHECK: ![[TBMD]] = !{ptr @[[TB]], i32 15, i32 0, i1 false, i32 2, i32 1} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl index 3cf1056cf4f48..a4054cba236dd 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl @@ -4,6 +4,8 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm 
-o - %s | FileCheck %s // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + typedef unsigned int uint; typedef unsigned long ulong; @@ -19,12 +21,64 @@ void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0); } -// CHECK-LABEL: @test_mov_dpp8( -// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1) -void test_mov_dpp8(global uint* out, uint a) { +// CHECK-LABEL: @test_mov_dpp8_uint( +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1) +// CHECK-NEXT: store i32 %0, +void test_mov_dpp8_uint(global uint* out, uint a) { + *out = __builtin_amdgcn_mov_dpp8(a, 1); +} + +// CHECK-LABEL: @test_mov_dpp8_long( +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %a, i32 1) +// CHECK-NEXT: store i64 %0, +void test_mov_dpp8_long(global long* out, long a) { *out = __builtin_amdgcn_mov_dpp8(a, 1); } +// CHECK-LABEL: @test_mov_dpp8_float( +// CHECK: %0 = bitcast float %a to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1) +// CHECK-NEXT: store i32 %1, +void test_mov_dpp8_float(global float* out, float a) { + *out = __builtin_amdgcn_mov_dpp8(a, 1); +} + +// CHECK-LABEL: @test_mov_dpp8_double +// CHECK: %0 = bitcast double %x to i64 +// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1) +// CHECK-NEXT: store i64 %1, +void test_mov_dpp8_double(double x, global double *p) { + *p = __builtin_amdgcn_mov_dpp8(x, 1); +} + +// CHECK-LABEL: @test_mov_dpp8_short +// CHECK: %0 = zext i16 %x to i32 +// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1) +// CHECK-NEXT: %2 = trunc i32 %1 to i16 +// CHECK-NEXT: store i16 %2, +void test_mov_dpp8_short(short x, global short *p) { + *p = __builtin_amdgcn_mov_dpp8(x, 1); +} + +// CHECK-LABEL: @test_mov_dpp8_char +// CHECK: %0 = zext i8 %x to i32 +// CHECK-NEXT: %1 = tail 
call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1) +// CHECK-NEXT: %2 = trunc i32 %1 to i8 +// CHECK-NEXT: store i8 %2, +void test_mov_dpp8_char(char x, global char *p) { + *p = __builtin_amdgcn_mov_dpp8(x, 1); +} + +// CHECK-LABEL: @test_mov_dpp8_half +// CHECK: %0 = load i16, +// CHECK: %1 = zext i16 %0 to i32 +// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1) +// CHECK-NEXT: %3 = trunc i32 %2 to i16 +// CHECK-NEXT: store i16 %3, +void test_mov_dpp8_half(half *x, global half *p) { + *p = __builtin_amdgcn_mov_dpp8(*x, 1); +} + // CHECK-LABEL: @test_s_memtime // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime() void test_s_memtime(global ulong* out) diff --git a/clang/test/Driver/avr-toolchain.c b/clang/test/Driver/avr-toolchain.c index 45ccf890febda..9d17476f30a69 100644 --- a/clang/test/Driver/avr-toolchain.c +++ b/clang/test/Driver/avr-toolchain.c @@ -44,7 +44,7 @@ // RUN: %clang -### --target=avr --sysroot=%S/Inputs/basic_avr_tree -S %s 2>&1 | FileCheck --check-prefixes=NOMCU,LINKA %s // RUN: %clang -### --target=avr --sysroot=%S/Inputs/ -S %s 2>&1 | FileCheck --check-prefixes=NOMCU,LINKA %s // RUN: %clang -### --target=avr --sysroot=%S/Inputs/basic_avr_tree %s 2>&1 | FileCheck --check-prefixes=NOMCU,LINKB %s -// NOMCU: warning: no target microcontroller specified on command line, cannot link standard libraries, please pass -mmcu= +// NOMCU: warning: no target microcontroller specified, please pass -mmcu= // LINKB: warning: standard library not linked and so no interrupt vector table or compiler runtime routines will be linked // LINKB: warning: support for passing the data section address to the linker for microcontroller '' is not implemented // NOMCU-NOT: warning: {{.*}} avr-gcc diff --git a/clang/test/Driver/clang-sycl-linker-test.cpp b/clang/test/Driver/clang-sycl-linker-test.cpp new file mode 100644 index 0000000000000..f358900b4fbd8 --- /dev/null +++ b/clang/test/Driver/clang-sycl-linker-test.cpp @@ -0,0 +1,48 @@ 
+// Tests the clang-sycl-linker tool. +// +// Test a simple case without arguments. +// RUN: %clangxx -emit-llvm -c %s -o %t_1.bc +// RUN: %clangxx -emit-llvm -c %s -o %t_2.bc +// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=SIMPLE +// SIMPLE: "{{.*}}llvm-link{{.*}}" {{.*}}.bc {{.*}}.bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// SIMPLE-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[FIRSTLLVMLINKOUT]].bc +// +// Test that llvm-link is not called when only one input is present. +// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=SIMPLE-NO-LINK +// SIMPLE-NO-LINK: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv {{.*}}.bc +// +// Test a simple case with device library files specified. +// RUN: touch %T/lib1.bc +// RUN: touch %T/lib2.bc +// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=DEVLIBS +// DEVLIBS: "{{.*}}llvm-link{{.*}}" {{.*}}.bc {{.*}}.bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// DEVLIBS-NEXT: "{{.*}}llvm-link{{.*}}" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}lib1.bc {{.*}}lib2.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// DEVLIBS-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[SECONDLLVMLINKOUT]].bc +// +// Test a simple case with .o (fat object) as input. +// TODO: Remove this test once fat object support is added. +// RUN: %clangxx -c %s -o %t.o +// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t.o -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=FILETYPEERROR +// FILETYPEERROR: Unsupported file type +// +// Test to see if device library related errors are emitted. 
+// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs= -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 +// DEVLIBSERR1: Number of device library files cannot be zero +// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 +// DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found +// +// Test if correct set of llvm-spirv options are emitted for windows environment. +// RUN: clang-sycl-linker --dry-run -triple spirv64 --is-windows-msvc-env %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=LLVMOPTSWIN +// LLVMOPTSWIN: -spirv-debug-info-version=ocl-100 -spirv-allow-extra-diexpressions -spirv-allow-unknown-intrinsics=llvm.genx. -spirv-ext= +// +// Test if correct set of llvm-spirv options are emitted for linux environment. +// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=LLVMOPTSLIN +// LLVMOPTSLIN: -spirv-debug-info-version=nonsemantic-shader-200 -spirv-allow-unknown-intrinsics=llvm.genx. 
-spirv-ext= diff --git a/clang/test/Driver/fprofile-generate-cold-function-coverage.c b/clang/test/Driver/fprofile-generate-cold-function-coverage.c index 9b2f46423f34b..135acf2e736f7 100644 --- a/clang/test/Driver/fprofile-generate-cold-function-coverage.c +++ b/clang/test/Driver/fprofile-generate-cold-function-coverage.c @@ -1,5 +1,6 @@ // RUN: %clang -### -c -fprofile-generate-cold-function-coverage %s 2>&1 | FileCheck %s // CHECK: "--instrument-cold-function-only-path=default_%m.profraw" +// CHECK: "--pgo-instrument-cold-function-only" // CHECK: "--pgo-function-entry-coverage" // CHECK-NOT: "-fprofile-instrument" // CHECK-NOT: "-fprofile-instrument-path= diff --git a/clang/test/Driver/index-header-map.c b/clang/test/Driver/index-header-map.c deleted file mode 100644 index 8bd677a0ba98a..0000000000000 --- a/clang/test/Driver/index-header-map.c +++ /dev/null @@ -1,4 +0,0 @@ -// RUN: %clang -I%S/Before -index-header-map -I%S/Index -I%S/After %s -### 2>> %t.log -// RUN: FileCheck %s < %t.log - -// CHECK: {{-I.*Before.*-index-header-map.*-I.*Index.*-I.*After}} diff --git a/clang/test/Driver/ps5-linker.c b/clang/test/Driver/ps5-linker.c index 2080f4dc91a7f..0fcc0f02f5a90 100644 --- a/clang/test/Driver/ps5-linker.c +++ b/clang/test/Driver/ps5-linker.c @@ -1,3 +1,10 @@ +// Test that a target emulation is supplied to the linker + +// RUN: %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-EMU %s + +// CHECK-EMU: {{ld(\.exe)?}}" +// CHECK-EMU-SAME: "-m" "elf_x86_64_fbsd" + // Test that PIE is the default for main components // RUN: %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-PIE %s @@ -14,6 +21,22 @@ // CHECK-NO-PIE-NOT: "-pie" // CHECK-SHARED: "--shared" +// Test the driver supplies an --image-base to the linker only for non-pie +// executables. 
+ +// RUN: %clang --target=x86_64-sie-ps5 -static %s -### 2>&1 | FileCheck --check-prefixes=CHECK-BASE %s +// RUN: %clang --target=x86_64-sie-ps5 -no-pie %s -### 2>&1 | FileCheck --check-prefixes=CHECK-BASE %s + +// CHECK-BASE: {{ld(\.exe)?}}" +// CHECK-BASE-SAME: "--image-base=0x400000" + +// RUN: %clang --target=x86_64-sie-ps5 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-BASE %s +// RUN: %clang --target=x86_64-sie-ps5 -r %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-BASE %s +// RUN: %clang --target=x86_64-sie-ps5 -shared %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-BASE %s + +// CHECK-NO-BASE: {{ld(\.exe)?}}" +// CHECK-NO-BASE-NOT: --image-base + // Test the driver passes PlayStation-specific options to the linker that are // appropriate for the type of output. Many options don't apply for relocatable // output (-r). @@ -30,6 +53,8 @@ // CHECK-EXE-SAME: "--unresolved-symbols=report-all" // CHECK-EXE-SAME: "-z" "now" // CHECK-EXE-SAME: "-z" "start-stop-visibility=hidden" +// CHECK-EXE-SAME: "-z" "common-page-size=0x4000" +// CHECK-EXE-SAME: "-z" "max-page-size=0x4000" // CHECK-EXE-SAME: "-z" "dead-reloc-in-nonalloc=.debug_*=0xffffffffffffffff" // CHECK-EXE-SAME: "-z" "dead-reloc-in-nonalloc=.debug_ranges=0xfffffffffffffffe" // CHECK-EXE-SAME: "-z" "dead-reloc-in-nonalloc=.debug_loc=0xfffffffffffffffe" diff --git a/clang/test/Driver/sycl-link-spirv-target.cpp b/clang/test/Driver/sycl-link-spirv-target.cpp new file mode 100644 index 0000000000000..85566c67ea92b --- /dev/null +++ b/clang/test/Driver/sycl-link-spirv-target.cpp @@ -0,0 +1,9 @@ +// Tests the driver when linking LLVM IR bitcode files and targeting SPIR-V +// architecture. +// +// Test that -Xlinker options are being passed to clang-sycl-linker. 
+// RUN: touch %t.bc +// RUN: %clangxx -### --target=spirv64 --sycl-link -Xlinker --llvm-spirv-path=/tmp \ +// RUN: -Xlinker --library-path=/tmp -Xlinker --device-libs=lib1.bc,lib2.bc %t.bc 2>&1 \ +// RUN: | FileCheck %s -check-prefix=XLINKEROPTS +// XLINKEROPTS: "{{.*}}clang-sycl-linker{{.*}}" "--llvm-spirv-path=/tmp" "--library-path=/tmp" "--device-libs=lib1.bc,lib2.bc" "{{.*}}.bc" "-o" "a.out" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 02370ef60b7fe..e8c439ab48f21 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -304,6 +304,13 @@ // AMX-COMPLEX: "-target-feature" "+amx-complex" // NO-AMX-COMPLEX: "-target-feature" "-amx-complex" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-transpose %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-transpose %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s +// AMX-TRANSPOSE: "-target-feature" "+amx-transpose" +// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" diff --git a/clang/test/Modules/diff-retain-comments-from-system-headers-flag.cppm b/clang/test/Modules/diff-retain-comments-from-system-headers-flag.cppm new file mode 100644 index 0000000000000..5ea896d6ae268 --- /dev/null +++ b/clang/test/Modules/diff-retain-comments-from-system-headers-flag.cppm @@ -0,0 +1,13 @@ +// RUN: rm -rf %t +// RUN: mkdir %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm -fretain-comments-from-system-headers +// RUN: %clang_cc1 -std=c++20 %t/b.cpp -fmodule-file=a=%t/a.pcm -verify -fsyntax-only + +//--- a.cppm 
+export module a; + +//--- b.cpp +// expected-no-diagnostics +import a; diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 2d1d2e57bdc77..c240b27c91a47 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -546,6 +546,18 @@ // NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TRANSPOSE %s + +// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1 + +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s + +// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 688f55edfe84d..6a2af01ea5116 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -384,6 +384,19 @@ struct X { std::unique_ptr pointer; }; +struct [[gsl::Owner]] XOwner { + int* get() const [[clang::lifetimebound]]; +}; +struct X2 { + // A common usage that moves the passing owner to the class. + // verify no warning on this case. 
+ X2(XOwner owner) : + pointee(owner.get()), + owner(std::move(owner)) {} + int* pointee; + XOwner owner; +}; + std::vector::iterator getIt(); std::vector getVec(); diff --git a/clang/test/SemaCXX/enum.cpp b/clang/test/SemaCXX/enum.cpp index 9c398cc8da886..44042d8bf5cfc 100644 --- a/clang/test/SemaCXX/enum.cpp +++ b/clang/test/SemaCXX/enum.cpp @@ -143,3 +143,11 @@ struct PR28903 { }) }; }; + +namespace GH112208 { +class C { + enum E { e = 0 }; + void f(int, enum E;); // expected-error {{ISO C++ forbids forward references to 'enum' types}} \ + // expected-error {{unexpected ';' before ')'}} +}; +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp index bfc34b55c1f66..724d444638b57 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-function-attr.cpp @@ -111,6 +111,37 @@ int testFoldExpression(Vs&&... v) { return (... + v); // expected-warning{{function introduces unsafe buffer manipulation}} } +struct HoldsUnsafeMembers { + HoldsUnsafeMembers() + : FromCtor(3), // expected-warning{{function introduces unsafe buffer manipulation}} + FromCtor2{3} // expected-warning{{function introduces unsafe buffer manipulation}} + {} + + [[clang::unsafe_buffer_usage]] + HoldsUnsafeMembers(int i) + : FromCtor(i), // expected-warning{{function introduces unsafe buffer manipulation}} + FromCtor2{i} // expected-warning{{function introduces unsafe buffer manipulation}} + {} + + HoldsUnsafeMembers(float f) + : HoldsUnsafeMembers(0) {} // expected-warning{{function introduces unsafe buffer manipulation}} + + UnsafeMembers FromCtor; + UnsafeMembers FromCtor2; + UnsafeMembers FromField{3}; // expected-warning 2{{function introduces unsafe buffer manipulation}} +}; + +struct SubclassUnsafeMembers : public UnsafeMembers { + SubclassUnsafeMembers() + : UnsafeMembers(3) // expected-warning{{function introduces unsafe buffer 
manipulation}} + {} + + [[clang::unsafe_buffer_usage]] + SubclassUnsafeMembers(int i) + : UnsafeMembers(i) // expected-warning{{function introduces unsafe buffer manipulation}} + {} +}; + // https://github.com/llvm/llvm-project/issues/80482 void testClassMembers() { UnsafeMembers(3); // expected-warning{{function introduces unsafe buffer manipulation}} @@ -122,4 +153,95 @@ void testClassMembers() { UnsafeMembers()(); // expected-warning{{function introduces unsafe buffer manipulation}} testFoldExpression(UnsafeMembers(), UnsafeMembers()); + + HoldsUnsafeMembers(); + HoldsUnsafeMembers(3); // expected-warning{{function introduces unsafe buffer manipulation}} + + SubclassUnsafeMembers(); + SubclassUnsafeMembers(3); // expected-warning{{function introduces unsafe buffer manipulation}} +} + +// Not an aggregate, so its constructor is not implicit code and will be +// visited/checked for warnings. +struct NotCalledHoldsUnsafeMembers { + NotCalledHoldsUnsafeMembers() + : FromCtor(3), // expected-warning{{function introduces unsafe buffer manipulation}} + FromCtor2{3} // expected-warning{{function introduces unsafe buffer manipulation}} + {} + + UnsafeMembers FromCtor; + UnsafeMembers FromCtor2; + UnsafeMembers FromField{3}; // expected-warning{{function introduces unsafe buffer manipulation}} +}; + +// An aggregate, so its constructor is implicit code. Since it's not called, it +// is never generated. +struct AggregateUnused { + UnsafeMembers f1; + // While this field would trigger the warning during initialization, since + // it's unused, there's no code generated that does the initialization, so + // no warning. + UnsafeMembers f2{3}; +}; + +struct AggregateExplicitlyInitializedSafe { + UnsafeMembers f1; + // The warning is not fired as the field is always explicitly initialized + // elsewhere. This initializer is never used. 
+ UnsafeMembers f2{3}; +}; + +void testAggregateExplicitlyInitializedSafe() { + AggregateExplicitlyInitializedSafe A{ + .f2 = UnsafeMembers(), // A safe constructor. + }; } + +struct AggregateExplicitlyInitializedUnsafe { + UnsafeMembers f1; + // The warning is not fired as the field is always explicitly initialized + // elsewhere. This initializer is never used. + UnsafeMembers f2{3}; +}; + +void testAggregateExplicitlyInitializedUnsafe() { + AggregateExplicitlyInitializedUnsafe A{ + .f2 = UnsafeMembers(3), // expected-warning{{function introduces unsafe buffer manipulation}} + }; +} + +struct AggregateViaAggregateInit { + UnsafeMembers f1; + // FIXME: A construction of this class does initialize the field through + // this initializer, so it should warn. Ideally it should also point to + // where the site of the construction is in testAggregateViaAggregateInit(). + UnsafeMembers f2{3}; +}; + +void testAggregateViaAggregateInit() { + AggregateViaAggregateInit A{}; +}; + +struct AggregateViaValueInit { + UnsafeMembers f1; + // FIXME: A construction of this class does initialize the field through + // this initializer, so it should warn. Ideally it should also point to + // where the site of the construction is in testAggregateViaValueInit(). + UnsafeMembers f2{3}; +}; + +void testAggregateViaValueInit() { + auto A = AggregateViaValueInit(); +}; + +struct AggregateViaDefaultInit { + UnsafeMembers f1; + // FIXME: A construction of this class does initialize the field through + // this initializer, so it should warn. Ideally it should also point to + // where the site of the construction is in testAggregateViaDefaultInit(). 
+ UnsafeMembers f2{3}; +}; + +void testAggregateViaDefaultInit() { + AggregateViaDefaultInit A; +}; diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-in-container-span-construct.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-in-container-span-construct.cpp index e97511593bbd8..c138fe088b3ba 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-in-container-span-construct.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-in-container-span-construct.cpp @@ -157,3 +157,23 @@ namespace test_flag { } } //namespace test_flag + +struct HoldsStdSpanAndInitializedInCtor { + char* Ptr; + unsigned Size; + std::span Span{Ptr, Size}; // no-warning (this code is unreachable) + + HoldsStdSpanAndInitializedInCtor(char* P, unsigned S) + : Span(P, S) // expected-warning{{the two-parameter std::span construction is unsafe as it can introduce mismatch between buffer size and the bound information}} + {} +}; + +struct HoldsStdSpanAndNotInitializedInCtor { + char* Ptr; + unsigned Size; + std::span Span{Ptr, Size}; // expected-warning{{the two-parameter std::span construction is unsafe as it can introduce mismatch between buffer size and the bound information}} + + HoldsStdSpanAndNotInitializedInCtor(char* P, unsigned S) + : Ptr(P), Size(S) + {} +}; diff --git a/clang/test/SemaObjC/validate-attr-swift_attr.m b/clang/test/SemaObjC/validate-attr-swift_attr.m index 2c73b0a892722..c7511098804a7 100644 --- a/clang/test/SemaObjC/validate-attr-swift_attr.m +++ b/clang/test/SemaObjC/validate-attr-swift_attr.m @@ -9,3 +9,7 @@ @interface I __attribute__((swift_attr(1))) @interface J @end + +@interface Error +// expected-error@-1 {{expected string literal as argument of 'swift_attr' attribute}} +@end diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10.cl index 02c8dc8c1339e..daae017142c79 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10.cl @@ 
-5,11 +5,30 @@ // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx900 -verify -S -o - %s // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx908 -verify -S -o - %s +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + typedef unsigned int uint; +typedef int int2 __attribute__((ext_vector_type(2))); +struct S { + int x; +}; void test(global uint* out, uint a, uint b, uint c, uint d) { *out = __builtin_amdgcn_permlane16(a, b, c, d, 1, 1); // expected-error {{'__builtin_amdgcn_permlane16' needs target feature gfx10-insts}} *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, 1); // expected-error {{'__builtin_amdgcn_permlanex16' needs target feature gfx10-insts}} *out = __builtin_amdgcn_mov_dpp8(a, 1); // expected-error {{'__builtin_amdgcn_mov_dpp8' needs target feature gfx10-insts}} } + +void test_mov_dpp8(global int* out, int src, int i, int2 i2, struct S s, float _Complex fc) +{ + *out = __builtin_amdgcn_mov_dpp8(src, i); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp8(src, 0.1); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}} + *out = __builtin_amdgcn_mov_dpp8(src); // expected-error{{too few arguments to function call, expected 2, have 1}} + *out = __builtin_amdgcn_mov_dpp8(src, 0, 0); // expected-error{{too many arguments to function call, expected at most 2, have 3}} + *out = __builtin_amdgcn_mov_dpp8(out, 0); // expected-error{{used type '__global int *__private' where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp8("aa", 0); // expected-error{{used type '__constant char[3]' where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp8(i2, 0); // expected-error{{used type '__private int2' (vector of 2 'int' values) where integer or floating point type is required}} + *out = __builtin_amdgcn_mov_dpp8(s, 0); // expected-error{{used type '__private struct S' where integer or floating point type is 
required}} + *out = __builtin_amdgcn_mov_dpp8(fc, 0); // expected-error{{used type '__private _Complex float' where integer or floating point type is required}} +} diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index 92a3361ce672e..4d3469aba4bb8 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -96,6 +96,7 @@ "yaml2obj", "clang-linker-wrapper", "clang-nvlink-wrapper", + "clang-sycl-linker", "llvm-lto", "llvm-lto2", "llvm-profdata", diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index 88e29412e5435..98c018e96848d 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_subdirectory(clang-nvlink-wrapper) add_clang_subdirectory(clang-offload-packager) add_clang_subdirectory(clang-offload-bundler) add_clang_subdirectory(clang-scan-deps) +add_clang_subdirectory(clang-sycl-linker) add_clang_subdirectory(clang-installapi) if(HAVE_CLANG_REPL_SUPPORT) add_clang_subdirectory(clang-repl) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 561b73c73ad7d..ebafd7eb7774e 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -216,33 +216,6 @@ void printCommands(ArrayRef CmdArgs) { exit(EXIT_FAILURE); } -/// Create an extra user-specified \p OffloadFile. -/// TODO: We should find a way to wrap these as libraries instead. 
-Expected getInputBitcodeLibrary(StringRef Input) { - auto [Device, Path] = StringRef(Input).split('='); - auto [String, Arch] = Device.rsplit('-'); - auto [Kind, Triple] = String.split('-'); - - llvm::ErrorOr> ImageOrError = - llvm::MemoryBuffer::getFileOrSTDIN(Path); - if (std::error_code EC = ImageOrError.getError()) - return createFileError(Path, EC); - - OffloadingImage Image{}; - Image.TheImageKind = IMG_Bitcode; - Image.TheOffloadKind = getOffloadKind(Kind); - Image.StringData["triple"] = Triple; - Image.StringData["arch"] = Arch; - Image.Image = std::move(*ImageOrError); - - std::unique_ptr Binary = - MemoryBuffer::getMemBufferCopy(OffloadBinary::write(Image)); - auto NewBinaryOrErr = OffloadBinary::create(*Binary); - if (!NewBinaryOrErr) - return NewBinaryOrErr.takeError(); - return OffloadFile(std::move(*NewBinaryOrErr), std::move(Binary)); -} - std::string getMainExecutable(const char *Name) { void *Ptr = (void *)(intptr_t)&getMainExecutable; auto COWPath = sys::fs::getMainExecutable(Name, Ptr); @@ -600,17 +573,6 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { for (StringRef Arg : Args.getAllArgValues(OPT_compiler_arg_EQ)) CmdArgs.push_back(Args.MakeArgString(Arg)); - for (StringRef Arg : Args.getAllArgValues(OPT_builtin_bitcode_EQ)) { - if (llvm::Triple(Arg.split('=').first) == Triple) - CmdArgs.append({"-Xclang", "-mlink-builtin-bitcode", "-Xclang", - Args.MakeArgString(Arg.split('=').second)}); - } - - // The OpenMPOpt pass can introduce new calls and is expensive, we do - // not want this when running CodeGen through clang. 
- if (Args.hasArg(OPT_clang_backend) || Args.hasArg(OPT_builtin_bitcode_EQ)) - CmdArgs.append({"-mllvm", "-openmp-opt-disable"}); - if (Error Err = executeCommands(*ClangPath, CmdArgs)) return std::move(Err); @@ -1362,13 +1324,6 @@ getDeviceInput(const ArgList &Args) { } } - for (StringRef Library : Args.getAllArgValues(OPT_bitcode_library_EQ)) { - auto FileOrErr = getInputBitcodeLibrary(Library); - if (!FileOrErr) - return FileOrErr.takeError(); - InputFiles[*FileOrErr].push_back(std::move(*FileOrErr)); - } - SmallVector> InputsForTarget; for (auto &[ID, Input] : InputFiles) InputsForTarget.emplace_back(std::move(Input)); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index a3e8199380046..57d918db0a73c 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -22,22 +22,12 @@ def host_triple_EQ : Joined<["--"], "host-triple=">, def opt_level : Joined<["--"], "opt-level=">, Flags<[WrapperOnlyOption]>, MetaVarName<"">, HelpText<"Optimization level for LTO">; -def bitcode_library_EQ : Joined<["--"], "bitcode-library=">, - Flags<[WrapperOnlyOption]>, MetaVarName<"--=">, - HelpText<"Extra bitcode library to link">; -def builtin_bitcode_EQ : Joined<["--"], "builtin-bitcode=">, - Flags<[WrapperOnlyOption]>, MetaVarName<"=">, - HelpText<"Perform a special internalizing link on the bitcode file. 
" - "This is necessary for some vendor libraries to be linked correctly">; def device_linker_args_EQ : Joined<["--"], "device-linker=">, Flags<[WrapperOnlyOption]>, MetaVarName<" or =">, HelpText<"Arguments to pass to the device linker invocation">; def device_compiler_args_EQ : Joined<["--"], "device-compiler=">, Flags<[WrapperOnlyOption]>, MetaVarName<" or =">, HelpText<"Arguments to pass to the device compiler invocation">; -def clang_backend : Flag<["--"], "clang-backend">, - Flags<[WrapperOnlyOption]>, - HelpText<"Run the backend using clang rather than the LTO backend">; def dry_run : Flag<["--"], "dry-run">, Flags<[WrapperOnlyOption]>, HelpText<"Print program arguments without running">; diff --git a/clang/tools/clang-sycl-linker/CMakeLists.txt b/clang/tools/clang-sycl-linker/CMakeLists.txt new file mode 100644 index 0000000000000..5665ad7d7186e --- /dev/null +++ b/clang/tools/clang-sycl-linker/CMakeLists.txt @@ -0,0 +1,32 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + BinaryFormat + Option + Object + TargetParser + Support + ) + +set(LLVM_TARGET_DEFINITIONS SYCLLinkOpts.td) +tablegen(LLVM SYCLLinkOpts.inc -gen-opt-parser-defs) +add_public_tablegen_target(SYCLLinkerOpts) + +if(NOT CLANG_BUILT_STANDALONE) + set(tablegen_deps intrinsics_gen SYCLLinkerOpts) +endif() + +add_clang_tool(clang-sycl-linker + ClangSYCLLinker.cpp + + DEPENDS + ${tablegen_deps} + ) + +set(CLANG_SYCL_LINKER_LIB_DEPS + clangBasic + ) + +target_link_libraries(clang-sycl-linker + PRIVATE + ${CLANG_SYCL_LINKER_LIB_DEPS} + ) diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp new file mode 100644 index 0000000000000..076458a275d98 --- /dev/null +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -0,0 +1,506 @@ +//=-------- clang-sycl-linker/ClangSYCLLinker.cpp - SYCL Linker util -------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This tool executes a sequence of steps required to link device code in SYCL +// device images. SYCL device code linking requires a complex sequence of steps +// that include linking of llvm bitcode files, linking device library files +// with the fully linked source bitcode file(s), running several SYCL specific +// post-link steps on the fully linked bitcode file(s), and finally generating +// target-specific device code. +//===---------------------------------------------------------------------===// + +#include "clang/Basic/Version.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/LTO/LTO.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/IRObjectFile.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/OptTable.h" +#include "llvm/Option/Option.h" +#include "llvm/Remarks/HotnessThresholdParser.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/WithColor.h" + +using namespace llvm; +using namespace llvm::opt; +using namespace llvm::object; + +/// Save intermediary 
results. +static bool SaveTemps = false; + +/// Print arguments without executing. +static bool DryRun = false; + +/// Print verbose output. +static bool Verbose = false; + +/// Filename of the output being created. +static StringRef OutputFile; + +/// Directory to dump SPIR-V IR if requested by user. +static SmallString<128> SPIRVDumpDir; + +static void printVersion(raw_ostream &OS) { + OS << clang::getClangToolFullVersion("clang-sycl-linker") << '\n'; +} + +/// The value of `argv[0]` when run. +static const char *Executable; + +/// Temporary files to be cleaned up. +static SmallVector> TempFiles; + +namespace { +// Must not overlap with llvm::opt::DriverFlag. +enum LinkerFlags { LinkerOnlyOption = (1 << 4) }; + +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "SYCLLinkOpts.inc" + LastOption +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + static constexpr StringLiteral NAME##_init[] = VALUE; \ + static constexpr ArrayRef NAME(NAME##_init, \ + std::size(NAME##_init) - 1); +#include "SYCLLinkOpts.inc" +#undef PREFIX + +static constexpr OptTable::Info InfoTable[] = { +#define OPTION(...) 
LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "SYCLLinkOpts.inc" +#undef OPTION +}; + +class LinkerOptTable : public opt::GenericOptTable { +public: + LinkerOptTable() : opt::GenericOptTable(InfoTable) {} +}; + +const OptTable &getOptTable() { + static const LinkerOptTable *Table = []() { + auto Result = std::make_unique(); + return Result.release(); + }(); + return *Table; +} + +[[noreturn]] void reportError(Error E) { + outs().flush(); + logAllUnhandledErrors(std::move(E), WithColor::error(errs(), Executable)); + exit(EXIT_FAILURE); +} + +std::string getMainExecutable(const char *Name) { + void *Ptr = (void *)(intptr_t)&getMainExecutable; + auto COWPath = sys::fs::getMainExecutable(Name, Ptr); + return sys::path::parent_path(COWPath).str(); +} + +Expected createTempFile(const ArgList &Args, const Twine &Prefix, + StringRef Extension) { + SmallString<128> OutputFile; + if (Args.hasArg(OPT_save_temps)) { + // Generate a unique path name without creating a file + sys::fs::createUniquePath(Prefix + "-%%%%%%." + Extension, OutputFile, + /*MakeAbsolute=*/false); + } else { + if (std::error_code EC = + sys::fs::createTemporaryFile(Prefix, Extension, OutputFile)) + return createFileError(OutputFile, EC); + } + + TempFiles.emplace_back(std::move(OutputFile)); + return TempFiles.back(); +} + +Expected findProgram(const ArgList &Args, StringRef Name, + ArrayRef Paths) { + if (Args.hasArg(OPT_dry_run)) + return Name.str(); + ErrorOr Path = sys::findProgramByName(Name, Paths); + if (!Path) + Path = sys::findProgramByName(Name); + if (!Path) + return createStringError(Path.getError(), + "Unable to find '" + Name + "' in path"); + return *Path; +} + +void printCommands(ArrayRef CmdArgs) { + if (CmdArgs.empty()) + return; + + llvm::errs() << " \"" << CmdArgs.front() << "\" "; + llvm::errs() << llvm::join(std::next(CmdArgs.begin()), CmdArgs.end(), " ") + << "\n"; +} + +/// Execute the command \p ExecutablePath with the arguments \p Args. 
+Error executeCommands(StringRef ExecutablePath, ArrayRef Args) { + if (Verbose || DryRun) + printCommands(Args); + + if (!DryRun) + if (sys::ExecuteAndWait(ExecutablePath, Args)) + return createStringError( + "'%s' failed", sys::path::filename(ExecutablePath).str().c_str()); + return Error::success(); +} + +Expected> getInput(const ArgList &Args) { + // Collect all input bitcode files to be passed to llvm-link. + SmallVector BitcodeFiles; + for (const opt::Arg *Arg : Args.filtered(OPT_INPUT)) { + std::optional Filename = std::string(Arg->getValue()); + if (!Filename || !sys::fs::exists(*Filename) || + sys::fs::is_directory(*Filename)) + continue; + file_magic Magic; + if (auto EC = identify_magic(*Filename, Magic)) + return createStringError("Failed to open file " + *Filename); + // TODO: Current use case involves LLVM IR bitcode files as input. + // This will be extended to support objects and SPIR-V IR files. + if (Magic != file_magic::bitcode) + return createStringError("Unsupported file type"); + BitcodeFiles.push_back(*Filename); + } + return BitcodeFiles; +} + +/// Link all SYCL device input files into one before adding device library +/// files. Device linking is performed using llvm-link tool. +/// 'InputFiles' is the list of all LLVM IR device input files. +/// 'Args' encompasses all arguments required for linking device code and will +/// be parsed to generate options required to be passed into llvm-link. +Expected linkDeviceInputFiles(ArrayRef InputFiles, + const ArgList &Args) { + llvm::TimeTraceScope TimeScope("SYCL LinkDeviceInputFiles"); + + assert(InputFiles.size() && "No inputs to llvm-link"); + // Early check to see if there is only one input. 
+ if (InputFiles.size() < 2) + return InputFiles[0]; + + Expected LLVMLinkPath = + findProgram(Args, "llvm-link", {getMainExecutable("llvm-link")}); + if (!LLVMLinkPath) + return LLVMLinkPath.takeError(); + + SmallVector CmdArgs; + CmdArgs.push_back(*LLVMLinkPath); + for (auto &File : InputFiles) + CmdArgs.push_back(File); + // Create a new file to write the linked device file to. + auto OutFileOrErr = + createTempFile(Args, sys::path::filename(OutputFile), "bc"); + if (!OutFileOrErr) + return OutFileOrErr.takeError(); + CmdArgs.push_back("-o"); + CmdArgs.push_back(*OutFileOrErr); + CmdArgs.push_back("--suppress-warnings"); + if (Error Err = executeCommands(*LLVMLinkPath, CmdArgs)) + return std::move(Err); + return Args.MakeArgString(*OutFileOrErr); +} + +// This utility function is used to gather all SYCL device library files that +// will be linked with input device files. +// The list of files and its location are passed from driver. +Expected> getSYCLDeviceLibs(const ArgList &Args) { + SmallVector DeviceLibFiles; + StringRef LibraryPath; + if (Arg *A = Args.getLastArg(OPT_library_path_EQ)) + LibraryPath = A->getValue(); + if (LibraryPath.empty()) + return DeviceLibFiles; + if (Arg *A = Args.getLastArg(OPT_device_libs_EQ)) { + if (A->getValues().size() == 0) + return createStringError( + inconvertibleErrorCode(), + "Number of device library files cannot be zero."); + for (StringRef Val : A->getValues()) { + SmallString<128> LibName(LibraryPath); + llvm::sys::path::append(LibName, Val); + if (llvm::sys::fs::exists(LibName)) + DeviceLibFiles.push_back(std::string(LibName)); + else + return createStringError(inconvertibleErrorCode(), + "\'" + std::string(LibName) + "\'" + + " SYCL device library file is not found."); + } + } + return DeviceLibFiles; +} + +/// Link all device library files and input file into one LLVM IR file. This +/// linking is performed using llvm-link tool. +/// 'InputFiles' is the list of all LLVM IR device input files. 
+/// 'Args' encompasses all arguments required for linking device code and will +/// be parsed to generate options required to be passed into llvm-link tool. +static Expected linkDeviceLibFiles(StringRef InputFile, + const ArgList &Args) { + llvm::TimeTraceScope TimeScope("LinkDeviceLibraryFiles"); + + auto SYCLDeviceLibFiles = getSYCLDeviceLibs(Args); + if (!SYCLDeviceLibFiles) + return SYCLDeviceLibFiles.takeError(); + if ((*SYCLDeviceLibFiles).empty()) + return InputFile; + + Expected LLVMLinkPath = + findProgram(Args, "llvm-link", {getMainExecutable("llvm-link")}); + if (!LLVMLinkPath) + return LLVMLinkPath.takeError(); + + // Create a new file to write the linked device file to. + auto OutFileOrErr = + createTempFile(Args, sys::path::filename(OutputFile), "bc"); + if (!OutFileOrErr) + return OutFileOrErr.takeError(); + + SmallVector CmdArgs; + CmdArgs.push_back(*LLVMLinkPath); + CmdArgs.push_back("-only-needed"); + CmdArgs.push_back(InputFile); + for (auto &File : *SYCLDeviceLibFiles) + CmdArgs.push_back(File); + CmdArgs.push_back("-o"); + CmdArgs.push_back(*OutFileOrErr); + CmdArgs.push_back("--suppress-warnings"); + if (Error Err = executeCommands(*LLVMLinkPath, CmdArgs)) + return std::move(Err); + return *OutFileOrErr; +} + +/// Add any llvm-spirv option that relies on a specific Triple in addition +/// to user supplied options. 
+static void getSPIRVTransOpts(const ArgList &Args, + SmallVector &TranslatorArgs, + const llvm::Triple Triple) { + // Enable NonSemanticShaderDebugInfo.200 for non-Windows + const bool IsWindowsMSVC = + Triple.isWindowsMSVCEnvironment() || Args.hasArg(OPT_is_windows_msvc_env); + const bool EnableNonSemanticDebug = !IsWindowsMSVC; + if (EnableNonSemanticDebug) { + TranslatorArgs.push_back( + "-spirv-debug-info-version=nonsemantic-shader-200"); + } else { + TranslatorArgs.push_back("-spirv-debug-info-version=ocl-100"); + // Prevent crash in the translator if input IR contains DIExpression + // operations which don't have mapping to OpenCL.DebugInfo.100 spec. + TranslatorArgs.push_back("-spirv-allow-extra-diexpressions"); + } + std::string UnknownIntrinsics("-spirv-allow-unknown-intrinsics=llvm.genx."); + + TranslatorArgs.push_back(Args.MakeArgString(UnknownIntrinsics)); + + // Disable all the extensions by default + std::string ExtArg("-spirv-ext=-all"); + std::string DefaultExtArg = + ",+SPV_EXT_shader_atomic_float_add,+SPV_EXT_shader_atomic_float_min_max" + ",+SPV_KHR_no_integer_wrap_decoration,+SPV_KHR_float_controls" + ",+SPV_KHR_expect_assume,+SPV_KHR_linkonce_odr"; + std::string INTELExtArg = + ",+SPV_INTEL_subgroups,+SPV_INTEL_media_block_io" + ",+SPV_INTEL_device_side_avc_motion_estimation" + ",+SPV_INTEL_fpga_loop_controls,+SPV_INTEL_unstructured_loop_controls" + ",+SPV_INTEL_fpga_reg,+SPV_INTEL_blocking_pipes" + ",+SPV_INTEL_function_pointers,+SPV_INTEL_kernel_attributes" + ",+SPV_INTEL_io_pipes,+SPV_INTEL_inline_assembly" + ",+SPV_INTEL_arbitrary_precision_integers" + ",+SPV_INTEL_float_controls2,+SPV_INTEL_vector_compute" + ",+SPV_INTEL_fast_composite" + ",+SPV_INTEL_arbitrary_precision_fixed_point" + ",+SPV_INTEL_arbitrary_precision_floating_point" + ",+SPV_INTEL_variable_length_array,+SPV_INTEL_fp_fast_math_mode" + ",+SPV_INTEL_long_constant_composite" + ",+SPV_INTEL_arithmetic_fence" + ",+SPV_INTEL_global_variable_decorations" + 
",+SPV_INTEL_cache_controls" + ",+SPV_INTEL_fpga_buffer_location" + ",+SPV_INTEL_fpga_argument_interfaces" + ",+SPV_INTEL_fpga_invocation_pipelining_attributes" + ",+SPV_INTEL_fpga_latency_control" + ",+SPV_INTEL_task_sequence" + ",+SPV_KHR_shader_clock" + ",+SPV_INTEL_bindless_images"; + ExtArg = ExtArg + DefaultExtArg + INTELExtArg; + ExtArg += ",+SPV_INTEL_token_type" + ",+SPV_INTEL_bfloat16_conversion" + ",+SPV_INTEL_joint_matrix" + ",+SPV_INTEL_hw_thread_queries" + ",+SPV_KHR_uniform_group_instructions" + ",+SPV_INTEL_masked_gather_scatter" + ",+SPV_INTEL_tensor_float32_conversion" + ",+SPV_INTEL_optnone" + ",+SPV_KHR_non_semantic_info" + ",+SPV_KHR_cooperative_matrix"; + TranslatorArgs.push_back(Args.MakeArgString(ExtArg)); +} + +/// Run LLVM to SPIR-V translation. +/// Converts 'File' from LLVM bitcode to SPIR-V format using llvm-spirv tool. +/// 'Args' encompasses all arguments required for linking device code and will +/// be parsed to generate options required to be passed into llvm-spirv tool. 
+static Expected runLLVMToSPIRVTranslation(StringRef File, + const ArgList &Args) { + llvm::TimeTraceScope TimeScope("LLVMToSPIRVTranslation"); + StringRef LLVMSPIRVPath = Args.getLastArgValue(OPT_llvm_spirv_path_EQ); + Expected LLVMToSPIRVProg = + findProgram(Args, "llvm-spirv", {LLVMSPIRVPath}); + if (!LLVMToSPIRVProg) + return LLVMToSPIRVProg.takeError(); + + SmallVector CmdArgs; + CmdArgs.push_back(*LLVMToSPIRVProg); + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple)); + getSPIRVTransOpts(Args, CmdArgs, Triple); + StringRef LLVMToSPIRVOptions; + if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) + LLVMToSPIRVOptions = A->getValue(); + LLVMToSPIRVOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, + /* KeepEmpty = */ false); + CmdArgs.append({"-o", OutputFile}); + CmdArgs.push_back(File); + if (Error Err = executeCommands(*LLVMToSPIRVProg, CmdArgs)) + return std::move(Err); + + if (!SPIRVDumpDir.empty()) { + std::error_code EC = + llvm::sys::fs::create_directory(SPIRVDumpDir, /*IgnoreExisting*/ true); + if (EC) + return createStringError( + EC, + formatv("failed to create dump directory. path: {0}, error_code: {1}", + SPIRVDumpDir, EC.value())); + + StringRef Path = OutputFile; + StringRef Filename = llvm::sys::path::filename(Path); + SmallString<128> CopyPath = SPIRVDumpDir; + CopyPath.append(Filename); + EC = llvm::sys::fs::copy_file(Path, CopyPath); + if (EC) + return createStringError( + EC, + formatv( + "failed to copy file. 
original: {0}, copy: {1}, error_code: {2}", + Path, CopyPath, EC.value())); + } + + return OutputFile; +} + +Error runSYCLLink(ArrayRef Files, const ArgList &Args) { + llvm::TimeTraceScope TimeScope("SYCLDeviceLink"); + // First llvm-link step + auto LinkedFile = linkDeviceInputFiles(Files, Args); + if (!LinkedFile) + reportError(LinkedFile.takeError()); + + // second llvm-link step + auto DeviceLinkedFile = linkDeviceLibFiles(*LinkedFile, Args); + if (!DeviceLinkedFile) + reportError(DeviceLinkedFile.takeError()); + + // LLVM to SPIR-V translation step + auto SPVFile = runLLVMToSPIRVTranslation(*DeviceLinkedFile, Args); + if (!SPVFile) + return SPVFile.takeError(); + return Error::success(); +} + +} // namespace + +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + + Executable = argv[0]; + sys::PrintStackTraceOnErrorSignal(argv[0]); + + const OptTable &Tbl = getOptTable(); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + auto Args = Tbl.parseArgs(argc, argv, OPT_INVALID, Saver, [&](StringRef Err) { + reportError(createStringError(inconvertibleErrorCode(), Err)); + }); + + if (Args.hasArg(OPT_help) || Args.hasArg(OPT_help_hidden)) { + Tbl.printHelp( + outs(), "clang-sycl-linker [options] ", + "A utility that wraps around several steps required to link SYCL " + "device files.\n" + "This enables LLVM IR linking, post-linking and code generation for " + "SYCL targets.", + Args.hasArg(OPT_help_hidden), Args.hasArg(OPT_help_hidden)); + return EXIT_SUCCESS; + } + + if (Args.hasArg(OPT_version)) + printVersion(outs()); + + Verbose = Args.hasArg(OPT_verbose); + DryRun = Args.hasArg(OPT_dry_run); + SaveTemps = Args.hasArg(OPT_save_temps); + + OutputFile = "a.spv"; + if (Args.hasArg(OPT_o)) + OutputFile = Args.getLastArgValue(OPT_o); + + if (Args.hasArg(OPT_spirv_dump_device_code_EQ)) { + Arg *A = Args.getLastArg(OPT_spirv_dump_device_code_EQ); + SmallString<128> Dir(A->getValue()); + if (Dir.empty()) + llvm::sys::path::native(Dir = "./"); + else + 
Dir.append(llvm::sys::path::get_separator()); + + SPIRVDumpDir = Dir; + } + + // Get the input files to pass to the linking stage. + auto FilesOrErr = getInput(Args); + if (!FilesOrErr) + reportError(FilesOrErr.takeError()); + + // Run SYCL linking process on the generated inputs. + if (Error Err = runSYCLLink(*FilesOrErr, Args)) + reportError(std::move(Err)); + + // Remove the temporary files created. + if (!Args.hasArg(OPT_save_temps)) + for (const auto &TempFile : TempFiles) + if (std::error_code EC = sys::fs::remove(TempFile)) + reportError(createFileError(TempFile, EC)); + + return EXIT_SUCCESS; +} diff --git a/clang/tools/clang-sycl-linker/SYCLLinkOpts.td b/clang/tools/clang-sycl-linker/SYCLLinkOpts.td new file mode 100644 index 0000000000000..959fd6c3e867c --- /dev/null +++ b/clang/tools/clang-sycl-linker/SYCLLinkOpts.td @@ -0,0 +1,52 @@ +include "llvm/Option/OptParser.td" + +def LinkerOnlyOption : OptionFlag; + +def help : Flag<["-", "--"], "help">, + HelpText<"Display available options (--help-hidden for more)">; + +def help_hidden : Flag<["-", "--"], "help-hidden">, + HelpText<"Display all available options">; + +def verbose : Flag<["-"], "v">, HelpText<"Print verbose information">; +def version : Flag<["--"], "version">, + HelpText<"Display the version number and exit">; + +def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"">, + HelpText<"Path to file to write output">; +def output : Separate<["--"], "output-file">, Alias, Flags<[HelpHidden]>, + HelpText<"Alias for -o">; + +def library_path_EQ : Joined<["--", "-"], "library-path=">, + Flags<[HelpHidden]>, HelpText<"Add to the library search path">; + +def device_libs_EQ : CommaJoined<["--", "-"], "device-libs=">, + Flags<[LinkerOnlyOption]>, + HelpText<"A comma separated list of device libraries that are linked during the device link.">; + +def triple : Joined<["--"], "triple">, + HelpText<"The device target triple">; +def arch : Separate<["--", "-"], "arch">, + HelpText<"Specify the name of the target 
architecture.">; + +def save_temps : Flag<["--", "-"], "save-temps">, + Flags<[LinkerOnlyOption]>, HelpText<"Save intermediate results">; + +def dry_run : Flag<["--", "-"], "dry-run">, Flags<[LinkerOnlyOption]>, + HelpText<"Print generated commands without running.">; + +def spirv_dump_device_code_EQ : Joined<["--", "-"], "spirv-dump-device-code=">, + Flags<[LinkerOnlyOption]>, + HelpText<"Path to the folder where the tool dumps SPIR-V device code. Other formats aren't dumped.">; + +def is_windows_msvc_env : Flag<["--", "-"], "is-windows-msvc-env">, + Flags<[LinkerOnlyOption, HelpHidden]>; + +def llvm_spirv_path_EQ : Joined<["--"], "llvm-spirv-path=">, + Flags<[LinkerOnlyOption]>, MetaVarName<"">, + HelpText<"Set the system llvm-spirv path">; + +// Options to pass to llvm-spirv tool +def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">, + Flags<[LinkerOnlyOption]>, + HelpText<"Options that will control llvm-spirv step">; diff --git a/clang/unittests/AST/ByteCode/toAPValue.cpp b/clang/unittests/AST/ByteCode/toAPValue.cpp index 3f141878fb959..cd62338ee23c1 100644 --- a/clang/unittests/AST/ByteCode/toAPValue.cpp +++ b/clang/unittests/AST/ByteCode/toAPValue.cpp @@ -22,7 +22,9 @@ TEST(ToAPValue, Pointers) { "constexpr S d = {{{true, false}, {false, true}, {false, false}}};\n" "constexpr const bool *b = &d.a[1].z;\n" "const void *p = (void*)12;\n" - "const void *nullp = (void*)0;\n"; + "const void *nullp = (void*)0;\n" + "extern int earr[5][5];\n" + "constexpr const int *arrp = &earr[2][4];\n"; auto AST = tooling::buildASTFromCodeWithArgs( Code, {"-fexperimental-new-constant-interpreter"}); @@ -87,6 +89,22 @@ TEST(ToAPValue, Pointers) { ASSERT_TRUE(Success); ASSERT_EQ(I, 0); } + + // A multidimensional array. 
+ { + const ValueDecl *D = getDecl("arrp"); + ASSERT_NE(D, nullptr); + const Pointer &GP = getGlobalPtr("arrp").deref(); + APValue A = GP.toAPValue(ASTCtx); + ASSERT_TRUE(A.isLValue()); + ASSERT_TRUE(A.hasLValuePath()); + ASSERT_EQ(A.getLValuePath().size(), 2u); + ASSERT_EQ(A.getLValuePath()[0].getAsArrayIndex(), 2u); + ASSERT_EQ(A.getLValuePath()[1].getAsArrayIndex(), 4u); + ASSERT_EQ(A.getLValueOffset().getQuantity(), 56u); + ASSERT_TRUE( + GP.atIndex(0).getFieldDesc()->getElemQualType()->isIntegerType()); + } } TEST(ToAPValue, FunctionPointers) { diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp index d696375547acc..056b7c7b571ef 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp +++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp @@ -3342,6 +3342,45 @@ TEST_P(ASTMatchersTest, declStmt(isInTemplateInstantiation()))); } +TEST_P(ASTMatchersTest, IsInstantiated_MatchesVariableInstantiation) { + if (!GetParam().isCXX14OrLater()) { + return; + } + + EXPECT_TRUE(matches("template int V = 10; void x() { V; }", + varDecl(isInstantiated()))); +} + +TEST_P(ASTMatchersTest, IsInstantiated_NotMatchesVariableDefinition) { + if (!GetParam().isCXX14OrLater()) { + return; + } + + EXPECT_TRUE(notMatches("template int V = 10;", + varDecl(isInstantiated()))); +} + +TEST_P(ASTMatchersTest, + IsInTemplateInstantiation_MatchesVariableInstantiationStmt) { + if (!GetParam().isCXX14OrLater()) { + return; + } + + EXPECT_TRUE(matches( + "template auto V = []() { T i; }; void x() { V(); }", + declStmt(isInTemplateInstantiation()))); +} + +TEST_P(ASTMatchersTest, + IsInTemplateInstantiation_NotMatchesVariableDefinitionStmt) { + if (!GetParam().isCXX14OrLater()) { + return; + } + + EXPECT_TRUE(notMatches("template auto V = []() { T i; };", + declStmt(isInTemplateInstantiation()))); +} + TEST_P(ASTMatchersTest, IsInTemplateInstantiation_Sharing) { if (!GetParam().isCXX()) { return; 
diff --git a/clang/unittests/Lex/HeaderSearchTest.cpp b/clang/unittests/Lex/HeaderSearchTest.cpp index b0375d5985f2e..4d07150c04e8d 100644 --- a/clang/unittests/Lex/HeaderSearchTest.cpp +++ b/clang/unittests/Lex/HeaderSearchTest.cpp @@ -75,8 +75,7 @@ class HeaderSearchTest : public ::testing::Test { // Test class supports only one HMap at a time. assert(!HMap); HMap = HeaderMap::Create(*FE, FileMgr); - auto DL = - DirectoryLookup(HMap.get(), SrcMgr::C_User, /*isFramework=*/false); + auto DL = DirectoryLookup(HMap.get(), SrcMgr::C_User); Search.AddSearchPath(DL, isAngled); } @@ -251,7 +250,6 @@ TEST_F(HeaderSearchTest, HeaderFrameworkLookup) { auto FI = Search.getExistingFileInfo(FE); EXPECT_TRUE(FI); EXPECT_TRUE(FI->IsValid); - EXPECT_EQ(FI->Framework.str(), "Foo"); EXPECT_EQ(Search.getIncludeNameForHeader(FE), "Foo/Foo.h"); } @@ -321,7 +319,6 @@ TEST_F(HeaderSearchTest, HeaderMapFrameworkLookup) { auto FI = Search.getExistingFileInfo(FE); EXPECT_TRUE(FI); EXPECT_TRUE(FI->IsValid); - EXPECT_EQ(FI->Framework.str(), "Foo"); EXPECT_EQ(Search.getIncludeNameForHeader(FE), "Foo/Foo.h"); } diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt index 5ef72cfaea401..f5da86e545603 100644 --- a/clang/unittests/StaticAnalyzer/CMakeLists.txt +++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt @@ -20,6 +20,7 @@ add_clang_unittest(StaticAnalysisTests RegisterCustomCheckersTest.cpp StoreTest.cpp SymbolReaperTest.cpp + SValSimplifyerTest.cpp SValTest.cpp TestReturnValueUnderConstruction.cpp Z3CrosscheckOracleTest.cpp diff --git a/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp b/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp new file mode 100644 index 0000000000000..85cfe2c1965ac --- /dev/null +++ b/clang/unittests/StaticAnalyzer/SValSimplifyerTest.cpp @@ -0,0 +1,103 @@ +//===- unittests/StaticAnalyzer/SValSimplifyerTest.cpp --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CheckerRegistration.h" +#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h" +#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" +#include "clang/StaticAnalyzer/Core/Checker.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" +#include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h" +#include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" + +using namespace clang; +using namespace ento; + +static std::string toString(SVal V) { + std::string Result; + llvm::raw_string_ostream Stream(Result); + V.dumpToStream(Stream); + return Result; +} + +static void replace(std::string &Content, StringRef Substr, + StringRef Replacement) { + std::size_t Pos = 0; + while ((Pos = Content.find(Substr, Pos)) != std::string::npos) { + Content.replace(Pos, Substr.size(), Replacement); + Pos += Replacement.size(); + } +} + +namespace { + +class SimplifyChecker : public Checker { + const BugType Bug{this, "SimplifyChecker"}; + const CallDescription SimplifyCall{CDM::SimpleFunc, {"simplify"}, 1}; + + void report(CheckerContext &C, const Expr *E, StringRef Description) const { + PathDiagnosticLocation Loc(E->getExprLoc(), C.getSourceManager()); + auto Report = std::make_unique(Bug, Description, Loc); + C.emitReport(std::move(Report)); + } + +public: + void checkPreCall(const CallEvent &Call, CheckerContext &C) const { + if (!SimplifyCall.matches(Call)) + return; + const Expr *Arg = Call.getArgExpr(0); + SVal Val = C.getSVal(Arg); + SVal SimplifiedVal = C.getSValBuilder().simplifySVal(C.getState(), Val); + std::string Subject = toString(Val); + 
std::string Simplified = toString(SimplifiedVal); + std::string Message = (llvm::Twine{Subject} + " -> " + Simplified).str(); + report(C, Arg, Message); + } +}; +} // namespace + +static void addSimplifyChecker(AnalysisASTConsumer &AnalysisConsumer, + AnalyzerOptions &AnOpts) { + AnOpts.CheckersAndPackages = {{"SimplifyChecker", true}}; + AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { + Registry.addChecker("SimplifyChecker", "EmptyDescription", + "EmptyDocsUri"); + }); +} + +static void runThisCheckerOnCode(const std::string &Code, std::string &Diags) { + ASSERT_TRUE(runCheckerOnCode(Code, Diags, + /*OnlyEmitWarnings=*/true)); + ASSERT_FALSE(Diags.empty()); + ASSERT_EQ(Diags.back(), '\n'); + Diags.pop_back(); +} + +namespace { + +TEST(SValSimplifyerTest, LHSConstrainedNullPtrDiff) { + constexpr auto Code = R"cpp( +template void simplify(T); +void LHSConstrainedNullPtrDiff(char *p, char *q) { + int diff = p - q; + if (!p) + simplify(diff); +})cpp"; + + std::string Diags; + runThisCheckerOnCode(Code, Diags); + replace(Diags, "(reg_$0)", "reg_p"); + replace(Diags, "(reg_$1)", "reg_q"); + // This should not be simplified to "Unknown". + EXPECT_EQ(Diags, "SimplifyChecker: reg_p - reg_q -> 0U - reg_q"); +} + +} // namespace diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index 5d0b7aaccd89d..d48b3bebe7611 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -160,8 +160,8 @@

Open Clang Projects

If you hit a bug with Clang, it is very useful for us if you reduce the code that demonstrates the problem down to something small. There are many ways to do this; ask on Discourse, -Discord, -or for advice.

+Discord +for advice.

diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 890d6c11c4076..a480ea23751d7 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -431,9 +431,12 @@ INTERCEPTOR(void, free, void *ptr) { if (DlsymAlloc::PointerIsMine(ptr)) return DlsymAlloc::Free(ptr); - if (ptr != NULL) { + // According to the C and C++ standard, freeing a nullptr is guaranteed to be + // a no-op (and thus real-time safe). This can be confirmed by looking at + // __libc_free in the glibc source. + if (ptr != nullptr) __rtsan_notify_intercepted_call("free"); - } + return REAL(free)(ptr); } @@ -464,10 +467,19 @@ INTERCEPTOR(void *, valloc, SIZE_T size) { } #if SANITIZER_INTERCEPT_ALIGNED_ALLOC + +// In some cases, when targeting older Darwin versions, this warning may pop up. +// Because we are providing a wrapper, the client is responsible to check +// whether aligned_alloc is available, not us. 
We still succeed linking on an +// old OS, because we are using a weak symbol (see aligned_alloc in +// sanitizer_platform_interceptors.h) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunguarded-availability-new" INTERCEPTOR(void *, aligned_alloc, SIZE_T alignment, SIZE_T size) { __rtsan_notify_intercepted_call("aligned_alloc"); return REAL(aligned_alloc)(alignment, size); } +#pragma clang diagnostic pop #define RTSAN_MAYBE_INTERCEPT_ALIGNED_ALLOC INTERCEPT_FUNCTION(aligned_alloc) #else #define RTSAN_MAYBE_INTERCEPT_ALIGNED_ALLOC diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 6233c3e91800e..38274485c29f6 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -122,13 +122,20 @@ TEST(TestRtsanInterceptors, VallocDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } -#if SANITIZER_INTERCEPT_ALIGNED_ALLOC +#if __has_builtin(__builtin_available) && SANITIZER_APPLE +#define ALIGNED_ALLOC_AVAILABLE() (__builtin_available(macOS 10.15, *)) +#else +// We are going to assume this is true until we hit systems where it isn't +#define ALIGNED_ALLOC_AVAILABLE() (true) +#endif + TEST(TestRtsanInterceptors, AlignedAllocDiesWhenRealtime) { - auto Func = []() { EXPECT_NE(nullptr, aligned_alloc(16, 32)); }; - ExpectRealtimeDeath(Func, "aligned_alloc"); - ExpectNonRealtimeSurvival(Func); + if (ALIGNED_ALLOC_AVAILABLE()) { + auto Func = []() { EXPECT_NE(nullptr, aligned_alloc(16, 32)); }; + ExpectRealtimeDeath(Func, "aligned_alloc"); + ExpectNonRealtimeSurvival(Func); + } } -#endif // free_sized and free_aligned_sized (both C23) are not yet supported TEST(TestRtsanInterceptors, FreeDiesWhenRealtime) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 33107eb0b4299..8b1850f85010c 100644 --- 
a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -256,6 +256,11 @@ int internal_madvise(uptr addr, uptr length, int advice) { return internal_syscall(SYSCALL(madvise), addr, length, advice); } +# if SANITIZER_FREEBSD +uptr internal_close_range(fd_t lowfd, fd_t highfd, int flags) { + return internal_syscall(SYSCALL(close_range), lowfd, highfd, flags); +} +# endif uptr internal_close(fd_t fd) { return internal_syscall(SYSCALL(close), fd); } uptr internal_open(const char *filename, int flags) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 6959a6d52d604..3fd6b595ef197 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -84,6 +84,25 @@ #define SI_NOT_MAC 1 #endif +#if SANITIZER_APPLE +# include + +// aligned_alloc was introduced in OSX 10.15 +// Linking will fail when using an older SDK +# if defined(__MAC_10_15) +// macOS 10.15 is greater than our minimal deployment target. To ensure we +// generate a weak reference so the dylib continues to work on older +// systems, we need to forward declare the intercepted function as "weak +// imports". 
+SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, + __sanitizer::usize __size); +# define SI_MAC_SDK_10_15_AVAILABLE 1 +# else +# define SI_MAC_SDK_10_15_AVAILABLE 0 +# endif // defined(__MAC_10_15) + +#endif // SANITIZER_APPLE + #if SANITIZER_IOS #define SI_IOS 1 #else @@ -500,7 +519,8 @@ #define SANITIZER_INTERCEPT_PVALLOC (SI_GLIBC || SI_ANDROID) #define SANITIZER_INTERCEPT_CFREE (SI_GLIBC && !SANITIZER_RISCV64) #define SANITIZER_INTERCEPT_REALLOCARRAY SI_POSIX -#define SANITIZER_INTERCEPT_ALIGNED_ALLOC (!SI_MAC) +#define SANITIZER_INTERCEPT_ALIGNED_ALLOC \ + (!SI_MAC || SI_MAC_SDK_10_15_AVAILABLE) #define SANITIZER_INTERCEPT_MALLOC_USABLE_SIZE (!SI_MAC && !SI_NETBSD) #define SANITIZER_INTERCEPT_MCHECK_MPROBE SI_LINUX_NOT_ANDROID #define SANITIZER_INTERCEPT_WCSLEN 1 diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index 1f0795caa420c..b5491c540dc08 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -28,6 +28,9 @@ namespace __sanitizer { // Don't use directly, use __sanitizer::OpenFile() instead. 
uptr internal_open(const char *filename, int flags); uptr internal_open(const char *filename, int flags, u32 mode); +# if SANITIZER_FREEBSD +uptr internal_close_range(fd_t lowfd, fd_t highfd, int flags); +# endif uptr internal_close(fd_t fd); uptr internal_read(fd_t fd, void *buf, uptr count); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 7ee2319456d23..b1eb2009cf157 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -543,7 +543,11 @@ pid_t StartSubprocess(const char *program, const char *const argv[], internal_close(stderr_fd); } +# if SANITIZER_FREEBSD + internal_close_range(3, ~static_cast(0), 0); +# else for (int fd = sysconf(_SC_OPEN_MAX); fd > 2; fd--) internal_close(fd); +# endif internal_execve(program, const_cast(&argv[0]), const_cast(envp)); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index a41f0f348f27a..d89713a9fc0b9 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -432,7 +432,7 @@ def CUFDeviceGlobal : Pass<"cuf-device-global", "mlir::ModuleOp"> { let summary = "Flag globals used in device function with data attribute"; let dependentDialects = [ - "cuf::CUFDialect" + "cuf::CUFDialect", "mlir::gpu::GPUDialect", "mlir::NVVM::NVVMDialect" ]; } diff --git a/flang/include/flang/Runtime/CUDA/memory.h b/flang/include/flang/Runtime/CUDA/memory.h index 3c3ae73d4ad7a..6d2e0c0f15942 100644 --- a/flang/include/flang/Runtime/CUDA/memory.h +++ b/flang/include/flang/Runtime/CUDA/memory.h @@ -28,7 +28,7 @@ void RTDECL(CUFMemFree)(void *devicePtr, unsigned type, /// Set value to the data hold by a descriptor. 
The \p value pointer must be /// addressable to the same amount of bytes specified by the element size of /// the descriptor \p desc. -void RTDECL(CUFMemsetDescriptor)(const Descriptor &desc, void *value, +void RTDECL(CUFMemsetDescriptor)(Descriptor *desc, void *value, const char *sourceFile = nullptr, int sourceLine = 0); /// Data transfer from a pointer to a pointer. @@ -36,19 +36,18 @@ void RTDECL(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes, unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); /// Data transfer from a pointer to a descriptor. -void RTDECL(CUFDataTransferDescPtr)(const Descriptor &dst, void *src, +void RTDECL(CUFDataTransferDescPtr)(Descriptor *dst, void *src, std::size_t bytes, unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); /// Data transfer from a descriptor to a pointer. -void RTDECL(CUFDataTransferPtrDesc)(void *dst, const Descriptor &src, +void RTDECL(CUFDataTransferPtrDesc)(void *dst, Descriptor *src, std::size_t bytes, unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); /// Data transfer from a descriptor to a descriptor. 
-void RTDECL(CUFDataTransferDescDesc)(const Descriptor &dst, - const Descriptor &src, unsigned mode, const char *sourceFile = nullptr, - int sourceLine = 0); +void RTDECL(CUFDataTransferDescDesc)(Descriptor *dst, Descriptor *src, + unsigned mode, const char *sourceFile = nullptr, int sourceLine = 0); } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 877fe122265dd..0e3011e73902d 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2131,18 +2131,37 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallVectorImpl &dirs) { assert(!incrementLoopNestInfo.empty() && "empty loop nest"); mlir::Location loc = toLocation(); + mlir::Operation *boundsAndStepIP = nullptr; + for (IncrementLoopInfo &info : incrementLoopNestInfo) { - info.loopVariable = - genLoopVariableAddress(loc, *info.loopVariableSym, info.isUnordered); - mlir::Value lowerValue = genControlValue(info.lowerExpr, info); - mlir::Value upperValue = genControlValue(info.upperExpr, info); - bool isConst = true; - mlir::Value stepValue = genControlValue( - info.stepExpr, info, info.isStructured() ? nullptr : &isConst); - // Use a temp variable for unstructured loops with non-const step. - if (!isConst) { - info.stepVariable = builder->createTemporary(loc, stepValue.getType()); - builder->create(loc, stepValue, info.stepVariable); + mlir::Value lowerValue; + mlir::Value upperValue; + mlir::Value stepValue; + + { + mlir::OpBuilder::InsertionGuard guard(*builder); + + // Set the IP before the first loop in the nest so that all nest bounds + // and step values are created outside the nest. 
+ if (boundsAndStepIP) + builder->setInsertionPointAfter(boundsAndStepIP); + + info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym, + info.isUnordered); + lowerValue = genControlValue(info.lowerExpr, info); + upperValue = genControlValue(info.upperExpr, info); + bool isConst = true; + stepValue = genControlValue(info.stepExpr, info, + info.isStructured() ? nullptr : &isConst); + boundsAndStepIP = stepValue.getDefiningOp(); + + // Use a temp variable for unstructured loops with non-const step. + if (!isConst) { + info.stepVariable = + builder->createTemporary(loc, stepValue.getType()); + boundsAndStepIP = + builder->create(loc, stepValue, info.stepVariable); + } } // Structured loop - generate fir.do_loop. diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 84985b880b1ec..329cbf3d7539f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -589,10 +589,27 @@ static void genLoopVars( llvm::SmallVector locs(args.size(), loc); firOpBuilder.createBlock(®ion, {}, tiv, locs); + // Update nested wrapper operands if parent wrappers have mapped these values + // to block arguments. + // + // Binding these values earlier would take care of this, but we cannot rely on + // that approach because binding in between the creation of a wrapper and the + // next one would result in 'hlfir.declare' operations being introduced inside + // of a wrapper, which is illegal. + mlir::IRMapping mapper; + for (auto [argGeneratingOp, blockArgs] : wrapperArgs) { + for (mlir::OpOperand &operand : argGeneratingOp->getOpOperands()) + operand.set(mapper.lookupOrDefault(operand.get())); + + for (const auto [arg, var] : llvm::zip_equal( + argGeneratingOp->getRegion(0).getArguments(), blockArgs.getVars())) + mapper.map(var, arg); + } + // Bind the entry block arguments of parent wrappers to the corresponding // symbols. 
- for (auto [argGeneratingOp, args] : wrapperArgs) - bindEntryBlockArgs(converter, argGeneratingOp, args); + for (auto [argGeneratingOp, blockArgs] : wrapperArgs) + bindEntryBlockArgs(converter, argGeneratingOp, blockArgs); // The argument is not currently in memory, so make a temporary for the // argument, and store it there, then bind that location to the argument. diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index e6eeb0d5db4a8..d038efcb2eb42 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2931,6 +2931,9 @@ struct GlobalOpConversion : public fir::FIROpConversion { comdatOp = rewriter.create(module.getLoc(), comdatName); } + if (auto select = comdatOp.lookupSymbol( + global.getSymName())) + return; mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToEnd(&comdatOp.getBody().back()); auto selectorOp = rewriter.create( @@ -2949,9 +2952,10 @@ struct LoadOpConversion : public fir::FIROpConversion { llvm::LogicalResult matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Type llvmLoadTy = convertObjectType(load.getType()); if (auto boxTy = mlir::dyn_cast(load.getType())) { - // fir.box is a special case because it is considered as an ssa values in + // fir.box is a special case because it is considered an ssa value in // fir, but it is lowered as a pointer to a descriptor. So // fir.ref and fir.box end up being the same llvm types and // loading a fir.ref is implemented as taking a snapshot of the @@ -2960,30 +2964,17 @@ struct LoadOpConversion : public fir::FIROpConversion { mlir::Location loc = load.getLoc(); auto newBoxStorage = genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter); - // TODO: always generate llvm.memcpy, LLVM is better at optimizing it than - // aggregate loads + stores. 
- if (boxTy.isAssumedRank()) { - - TypePair boxTypePair{boxTy, llvmLoadTy}; - mlir::Value boxSize = - computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter); - auto memcpy = rewriter.create( - loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false); - if (std::optional optionalTag = load.getTbaa()) - memcpy.setTBAATags(*optionalTag); - else - attachTBAATag(memcpy, boxTy, boxTy, nullptr); - } else { - auto boxValue = rewriter.create(loc, llvmLoadTy, - inputBoxStorage); - if (std::optional optionalTag = load.getTbaa()) - boxValue.setTBAATags(*optionalTag); - else - attachTBAATag(boxValue, boxTy, boxTy, nullptr); - auto storeOp = - rewriter.create(loc, boxValue, newBoxStorage); - attachTBAATag(storeOp, boxTy, boxTy, nullptr); - } + + TypePair boxTypePair{boxTy, llvmLoadTy}; + mlir::Value boxSize = + computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter); + auto memcpy = rewriter.create( + loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false); + + if (std::optional optionalTag = load.getTbaa()) + memcpy.setTBAATags(*optionalTag); + else + attachTBAATag(memcpy, boxTy, boxTy, nullptr); rewriter.replaceOp(load, newBoxStorage); } else { auto loadOp = rewriter.create( @@ -3227,20 +3218,13 @@ struct StoreOpConversion : public fir::FIROpConversion { mlir::LLVM::AliasAnalysisOpInterface newOp; if (auto boxTy = mlir::dyn_cast(storeTy)) { mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy); - // fir.box value is actually in memory, load it first before storing it, - // or do a memcopy for assumed-rank descriptors. 
- if (boxTy.isAssumedRank()) { - TypePair boxTypePair{boxTy, llvmBoxTy}; - mlir::Value boxSize = - computeBoxSize(loc, boxTypePair, llvmValue, rewriter); - newOp = rewriter.create( - loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false); - } else { - auto val = - rewriter.create(loc, llvmBoxTy, llvmValue); - attachTBAATag(val, boxTy, boxTy, nullptr); - newOp = rewriter.create(loc, val, llvmMemref); - } + // Always use memcpy because LLVM is not as effective at optimizing + // aggregate loads/stores as it is optimizing memcpy. + TypePair boxTypePair{boxTy, llvmBoxTy}; + mlir::Value boxSize = + computeBoxSize(loc, boxTypePair, llvmValue, rewriter); + newOp = rewriter.create( + loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false); } else { newOp = rewriter.create(loc, llvmValue, llvmMemref); } diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index 7cdb2f7ffe27d..dd204126be5db 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -11,11 +11,13 @@ #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/CodeGen/Target.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIRAttr.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" +#include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Support/DataLayout.h" #include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/registration.h" @@ -84,6 +86,8 @@ struct CUFAddConstructor auto registeredMod = builder.create( loc, llvmPtrTy, mlir::SymbolRefAttr::get(ctx, gpuMod.getName())); + fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false, + /*forceUnifiedTBAATree=*/false, *dl); // Register 
kernels for (auto func : gpuMod.getOps()) { if (func.isKernel()) { @@ -115,17 +119,25 @@ struct CUFAddConstructor fir::factory::createStringLiteral(builder, loc, gblNameStr)); // Global variable size - auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash( - loc, globalOp.getType(), *dl, kindMap); - auto size = - builder.createIntegerConstant(loc, idxTy, sizeAndAlign.first); + std::optional size; + if (auto boxTy = + mlir::dyn_cast(globalOp.getType())) { + mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy); + size = dl->getTypeSizeInBits(structTy) / 8; + } + if (!size) { + size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(), + *dl, kindMap) + .first; + } + auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size); // Global variable address mlir::Value addr = builder.create( loc, globalOp.resultType(), globalOp.getSymbol()); llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, registeredMod, addr, gblName, size)}; + builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; builder.create(loc, func, args); } break; case cuf::DataAttribute::Managed: diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index dc39be8574f84..a69b47ff74391 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Transforms/CUFCommon.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/allocatable.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -62,27 +63,26 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { // Copying the device global variable into the gpu module mlir::SymbolTable parentSymTable(mod); - auto gpuMod = - parentSymTable.lookup(cudaDeviceModuleName); - if (gpuMod) { - mlir::SymbolTable gpuSymTable(gpuMod); 
- for (auto globalOp : mod.getOps()) { - auto attr = globalOp.getDataAttrAttr(); - if (!attr) - continue; - switch (attr.getValue()) { - case cuf::DataAttribute::Device: - case cuf::DataAttribute::Constant: - case cuf::DataAttribute::Managed: { - auto globalName{globalOp.getSymbol().getValue()}; - if (gpuSymTable.lookup(globalName)) { - break; - } - gpuSymTable.insert(globalOp->clone()); - } break; - default: + auto gpuMod = cuf::getOrCreateGPUModule(mod, parentSymTable); + if (!gpuMod) + return signalPassFailure(); + mlir::SymbolTable gpuSymTable(gpuMod); + for (auto globalOp : mod.getOps()) { + auto attr = globalOp.getDataAttrAttr(); + if (!attr) + continue; + switch (attr.getValue()) { + case cuf::DataAttribute::Device: + case cuf::DataAttribute::Constant: + case cuf::DataAttribute::Managed: { + auto globalName{globalOp.getSymbol().getValue()}; + if (gpuSymTable.lookup(globalName)) { break; } + gpuSymTable.insert(globalOp->clone()); + } break; + default: + break; } } } diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index f1f3a95b220df..4050064ebe95d 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -529,8 +529,8 @@ struct CUFDataTransferOpConversion mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); mlir::Value sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); - mlir::Value dst = builder.loadIfRef(loc, op.getDst()); - mlir::Value src = builder.loadIfRef(loc, op.getSrc()); + mlir::Value dst = op.getDst(); + mlir::Value src = op.getSrc(); llvm::SmallVector args{fir::runtime::createArguments( builder, loc, fTy, dst, src, modeValue, sourceFile, sourceLine)}; builder.create(loc, func, args); @@ -552,9 +552,8 @@ struct CUFDataTransferOpConversion mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); mlir::Value sourceLine = 
fir::factory::locationToLineNo(builder, loc, fTy.getInput(3)); - mlir::Value dst = builder.loadIfRef(loc, op.getDst()); llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, dst, val, sourceFile, sourceLine)}; + builder, loc, fTy, op.getDst(), val, sourceFile, sourceLine)}; builder.create(loc, func, args); rewriter.eraseOp(op); } else { @@ -603,11 +602,8 @@ struct CUFDataTransferOpConversion mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); mlir::Value sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(5)); - mlir::Value dst = - dstIsDesc ? builder.loadIfRef(loc, op.getDst()) : op.getDst(); - mlir::Value src = mlir::isa(srcTy) - ? builder.loadIfRef(loc, op.getSrc()) - : op.getSrc(); + mlir::Value dst = op.getDst(); + mlir::Value src = op.getSrc(); llvm::SmallVector args{ fir::runtime::createArguments(builder, loc, fTy, dst, src, bytes, modeValue, sourceFile, sourceLine)}; diff --git a/flang/runtime/CUDA/memory.cpp b/flang/runtime/CUDA/memory.cpp index fc48b4343eea9..d03f1cc0e48d6 100644 --- a/flang/runtime/CUDA/memory.cpp +++ b/flang/runtime/CUDA/memory.cpp @@ -49,8 +49,8 @@ void RTDEF(CUFMemFree)( } } -void RTDEF(CUFMemsetDescriptor)(const Descriptor &desc, void *value, - const char *sourceFile, int sourceLine) { +void RTDEF(CUFMemsetDescriptor)( + Descriptor *desc, void *value, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; terminator.Crash("not yet implemented: CUDA data transfer from a scalar " "value to a descriptor"); @@ -73,23 +73,22 @@ void RTDEF(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes, CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, bytes, kind)); } -void RTDEF(CUFDataTransferDescPtr)(const Descriptor &desc, void *addr, +void RTDEF(CUFDataTransferDescPtr)(Descriptor *desc, void *addr, std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; terminator.Crash( "not yet 
implemented: CUDA data transfer from a pointer to a descriptor"); } -void RTDEF(CUFDataTransferPtrDesc)(void *addr, const Descriptor &desc, +void RTDEF(CUFDataTransferPtrDesc)(void *addr, Descriptor *desc, std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; terminator.Crash( "not yet implemented: CUDA data transfer from a descriptor to a pointer"); } -void RTDECL(CUFDataTransferDescDesc)(const Descriptor &dstDesc, - const Descriptor &srcDesc, unsigned mode, const char *sourceFile, - int sourceLine) { +void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc, + unsigned mode, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; terminator.Crash( "not yet implemented: CUDA data transfer between two descriptors"); diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index 378dabbb7c7e7..99386abc4fafd 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -3,6 +3,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 
cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { fir.global @_QMmtestsEn(dense<[3, 4, 5, 6, 7]> : tensor<5xi32>) {data_attr = #cuf.cuda} : !fir.array<5xi32> + fir.global @_QMmtestsEndev {data_attr = #cuf.cuda} : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap> + %1 = fircg.ext_embox %0(%c0) {allocator_idx = 2 : i32} : (!fir.heap>, index) -> !fir.box>> + fir.has_value %1 : !fir.box>> + } gpu.module @cuda_device_mod [#nvvm.target] { } @@ -18,5 +24,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>) -> !fir.ref // CHECK-DAG: %[[VAR_NAME2:.*]] = fir.convert %[[VAR_NAME]] : (!fir.ref>) -> !fir.ref // CHECK-DAG: %[[CST:.*]] = arith.constant 20 : index -// CHECK-DAG %[[CST2:.*]] = fir.convert %[[CST]] : (index) -> i64 -// CHECK fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref>, !fir.ref, !fir.ref, i64) -> none +// CHECK-DAG: %[[CST2:.*]] = fir.convert %[[CST]] : (index) -> i64 +// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE2]], %[[VAR_ADDR2]], %[[VAR_NAME2]], %[[CST2]]) : (!fir.ref>, !fir.ref, !fir.ref, i64) -> none +// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref>>> +// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref>>>) -> !fir.ref +// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}}) +// diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index c33c50115b9fc..cee3048e279cc 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -15,11 +15,9 @@ func.func @_QPsub1() { // CHECK-LABEL: func.func @_QPsub1() // CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Eadev"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[AHOST:.*]]:2 = hlfir.declare 
%{{.*}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Eahost"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) -// CHECK: %[[AHOST_LOAD:.*]] = fir.load %[[AHOST]]#0 : !fir.ref>>> -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> -// CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[AHOST_LOAD]] : (!fir.box>>) -> !fir.box -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box -// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: %[[AHOST_BOX:.*]] = fir.convert %[[AHOST]]#0 : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> +// CHECK: fir.call @_FortranACUFDataTransferDescDesc(%[[AHOST_BOX]], %[[ADEV_BOX]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref>, i32, !fir.ref, i32) -> none func.func @_QPsub2() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub2Eadev"} -> !fir.ref>>> @@ -35,10 +33,9 @@ func.func @_QPsub2() { // CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Eadev"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[C2:.*]] = arith.constant 2 : i32 // CHECK: fir.store %[[C2]] to %[[TEMP]] : !fir.ref -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[TEMP_CONV:.*]] = fir.convert %[[TEMP]] : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[TEMP_CONV]], %{{.*}}, %{{.*}}) : (!fir.box, !fir.llvm_ptr, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[TEMP_CONV]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.llvm_ptr, !fir.ref, i32) -> none 
func.func @_QPsub3() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub3Eadev"} -> !fir.ref>>> @@ -53,10 +50,9 @@ func.func @_QPsub3() { // CHECK-LABEL: func.func @_QPsub3() // CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Eadev"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) // CHECK: %[[V:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub3Ev"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[V_CONV:.*]] = fir.convert %[[V]]#0 : (!fir.ref) -> !fir.llvm_ptr -// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[V_CONV]], %{{.*}}, %{{.*}}) : (!fir.box, !fir.llvm_ptr, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFMemsetDescriptor(%[[ADEV_BOX]], %[[V_CONV]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.llvm_ptr, !fir.ref, i32) -> none func.func @_QPsub4() { %0 = cuf.alloc !fir.box>> {bindc_name = "adev", data_attr = #cuf.cuda, uniq_name = "_QFsub4Eadev"} -> !fir.ref>>> @@ -76,19 +72,17 @@ func.func @_QPsub4() { // CHECK: %[[NBELEM:.*]] = arith.constant 10 : index // CHECK: %[[WIDTH:.*]] = arith.constant 4 : index // CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#0 : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 -// CHECK: fir.call @_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box, 
!fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none // CHECK: %[[NBELEM:.*]] = arith.constant 10 : index // CHECK: %[[WIDTH:.*]] = arith.constant 4 : index // CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> // CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#0 : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 -// CHECK: fir.call @_FortranACUFDataTransferPtrDesc(%[[AHOST_PTR]], %[[ADEV_BOX]], %[[BYTES_CONV]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.box, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrDesc(%[[AHOST_PTR]], %[[ADEV_BOX]], %[[BYTES_CONV]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref>, i64, i32, !fir.ref, i32) -> none func.func @_QPsub5(%arg0: !fir.ref {fir.bindc_name = "n"}) { %0 = fir.dummy_scope : !fir.dscope @@ -122,19 +116,17 @@ func.func @_QPsub5(%arg0: !fir.ref {fir.bindc_name = "n"}) { // CHECK: %[[NBELEM:.*]] = arith.muli %[[I1]], %[[I2]] : index // CHECK: %[[WIDTH:.*]] = arith.constant 4 : index // CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#1 : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 -// CHECK: fir.call 
@_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferDescPtr(%[[ADEV_BOX]], %[[AHOST_PTR]], %[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.llvm_ptr, i64, i32, !fir.ref, i32) -> none // CHECK: %[[NBELEM:.*]] = arith.muli %[[I1]], %[[I2]] : index // CHECK: %[[WIDTH:.*]] = arith.constant 4 : index // CHECK: %[[BYTES:.*]] = arith.muli %[[NBELEM]], %[[WIDTH]] : index -// CHECK: %[[ADEV_LOAD:.*]] = fir.load %[[ADEV]]#0 : !fir.ref>>> // CHECK: %[[AHOST_PTR:.*]] = fir.convert %[[AHOST]]#1 : (!fir.ref>) -> !fir.llvm_ptr -// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV_LOAD]] : (!fir.box>>) -> !fir.box +// CHECK: %[[ADEV_BOX:.*]] = fir.convert %[[ADEV]]#0 : (!fir.ref>>>) -> !fir.ref> // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64 -// CHECK: fir.call @_FortranACUFDataTransferPtrDesc(%[[AHOST_PTR]], %[[ADEV_BOX]], %[[BYTES_CONV]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.box, i64, i32, !fir.ref, i32) -> none +// CHECK: fir.call @_FortranACUFDataTransferPtrDesc(%[[AHOST_PTR]], %[[ADEV_BOX]], %[[BYTES_CONV]], %c1{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref>, i64, i32, !fir.ref, i32) -> none func.func @_QPsub6() { %0 = cuf.alloc i32 {bindc_name = "idev", data_attr = #cuf.cuda, uniq_name = "_QFsub6Eidev"} -> !fir.ref diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 82a0c5948d9cb..18b56a491cd65 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -25,6 +25,9 @@ // Test that global used in device function are flagged with the correct // CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath : (i32, !fir.ref, i32) -> !fir.ref // CHECK: fir.global linkonce @_QQcl[[SYMBOL]] {data_attr 
= #cuf.cuda} constant : !fir.char<1,32> +// CHECK-LABEL: gpu.module @cuda_device_mod [#nvvm.target] +// CHECK: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a + // ----- func.func @_QMdataPsetvalue() { @@ -47,3 +50,6 @@ // Test that global used in device function are flagged with the correct // CHECK: %[[CONV:.*]] = fir.convert %[[GLOBAL]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath : (i32, !fir.ref, i32) -> !fir.ref // CHECK: fir.global linkonce @_QQcl[[SYMBOL]] constant : !fir.char<1,32> + +// CHECK-LABEL: gpu.module @cuda_device_mod [#nvvm.target] +// CHECK-NOT: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir index 6b0cbfd3aca63..25ab8dd786a4e 100644 --- a/flang/test/Fir/CUDA/cuda-register-func.fir +++ b/flang/test/Fir/CUDA/cuda-register-func.fir @@ -1,6 +1,6 @@ // RUN: fir-opt --cuf-add-constructor %s | FileCheck %s -module attributes {gpu.container_module} { +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 
cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { gpu.module @cuda_device_mod { gpu.func @_QPsub_device1() kernel { gpu.return diff --git a/flang/test/Fir/box.fir b/flang/test/Fir/box.fir index 81a4d8bc13bf0..fd9fa1f2b3aab 100644 --- a/flang/test/Fir/box.fir +++ b/flang/test/Fir/box.fir @@ -56,12 +56,14 @@ func.func @fa(%a : !fir.ref>) { // CHECK-LABEL: define void @b1( // CHECK-SAME: ptr %[[res:.*]], ptr %[[arg0:.*]], i64 %[[arg1:.*]]) func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box> { + // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} undef, i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %x = fir.embox %arg0 typeparams %arg1 : (!fir.ref>, index) -> !fir.box> - // CHECK: store {{.*}}, ptr %[[res]] + // CHECK: store {{.*}}, ptr %[[alloca]] + // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 24, i1 false) return %x : !fir.box> } @@ -71,11 +73,13 @@ func.func @b1(%arg0 : !fir.ref>, %arg1 : index) -> !fir.box>>, %arg1 : index) -> !fir.box>> { %1 = fir.shape %arg1 : (index) -> !fir.shape<1> + // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: insertvalue {{.*}} { ptr undef, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), i32 20240719, i8 1, i8 40, i8 0, i8 0, {{.*}} }, i64 %[[arg1]], 7, 0, 1 // CHECK: insertvalue {{.*}} %{{.*}}, i64 ptrtoint (ptr getelementptr ([5 x i8], ptr null, i32 1) to i64), 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %2 = fir.embox %arg0(%1) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[res]] + // CHECK: store {{.*}}, ptr %[[alloca]] + // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) return %2 : !fir.box>> } 
@@ -84,6 +88,7 @@ func.func @b2(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK-SAME: ptr %[[res:.*]], ptr %[[arg0:.*]], i64 %[[arg1:.*]], i64 %[[arg2:.*]]) func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %arg2 : index) -> !fir.box>> { %1 = fir.shape %arg2 : (index) -> !fir.shape<1> + // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 @@ -91,7 +96,8 @@ func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %ar // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %2 = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[res]] + // CHECK: store {{.*}}, ptr %[[alloca]] + // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) return %2 : !fir.box>> } @@ -101,6 +107,7 @@ func.func @b3(%arg0 : !fir.ref>>, %arg1 : index, %ar func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> !fir.box>> { %c_7 = arith.constant 7 : index %1 = fir.shape %c_7 : (index) -> !fir.shape<1> + // CHECK: %[[alloca:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } // CHECK: %[[size:.*]] = mul i64 ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64), %[[arg1]] // CHECK: insertvalue {{.*}} i64 %[[size]], 1 // CHECK: insertvalue {{.*}} i32 20240719, 2 @@ -108,7 +115,8 @@ func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK: insertvalue {{.*}} i64 %[[size]], 7, 0, 2 // CHECK: insertvalue {{.*}} ptr %[[arg0]], 0 %x = fir.embox %arg0(%1) typeparams %arg1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> - // CHECK: store {{.*}}, ptr %[[res]] + // CHECK: store {{.*}}, ptr %[[alloca]] + // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[res]], ptr %[[alloca]], i32 48, i1 false) return %x : !fir.box>> } @@ -117,8 
+125,7 @@ func.func @b4(%arg0 : !fir.ref>>, %arg1 : index) -> // CHECK-SAME: ptr %[[arg0:.*]], ptr %[[arg1:.*]]) func.func @b5(%arg0 : !fir.ref>>>, %arg1 : !fir.box>>) { fir.store %arg1 to %arg0 : !fir.ref>>> - // CHECK: %[[boxLoad:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] }, ptr %[[arg1]] - // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [2 x [3 x i64]] } %[[boxLoad]], ptr %[[arg0]] + // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %0, ptr %1, i32 72, i1 false) return } diff --git a/flang/test/Fir/comdat-present.fir b/flang/test/Fir/comdat-present.fir new file mode 100644 index 0000000000000..96d14e5973f4f --- /dev/null +++ b/flang/test/Fir/comdat-present.fir @@ -0,0 +1,14 @@ +// RUN: fir-opt %s --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck %s +// RUN: fir-opt %s --fir-to-llvm-ir="target=x86_64-pc-windows-msvc" | FileCheck %s + +fir.global linkonce_odr @global_linkonce_odr constant : i32 { + %0 = arith.constant 0 : i32 + fir.has_value %0 : i32 +} + +llvm.comdat @__llvm_comdat { + llvm.comdat_selector @global_linkonce_odr any +} + +// CHECK-LABEL: llvm.comdat @__llvm_comdat +// CHECK: llvm.comdat_selector @global_linkonce_odr any diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 335877e7c9a87..168526518865b 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -799,8 +799,8 @@ func.func @_QPs(%arg0: !fir.ref> {fir.bindc_name = "x"}) { //CHECK: omp.parallel { //CHECK: %[[CONST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 //CHECK: %[[ALLOCA_1:.*]] = llvm.alloca %[[CONST_1:.*]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr -//CHECK: %[[LOAD:.*]] = llvm.load %[[ALLOCA]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> -//CHECK: llvm.store %[[LOAD]], %[[ALLOCA_1]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>, !llvm.ptr +//CHECK: %[[SIZE:.*]] 
= llvm.mlir.constant(24 : i32) : i32 +//CHECK: "llvm.intr.memcpy"(%[[ALLOCA_1]], %[[ALLOCA]], %[[SIZE]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () //CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA_1]][0, 0] : (!llvm.ptr) -> !llvm.ptr //CHECK: %[[LOAD_2:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> !llvm.ptr //CHECK: omp.terminator diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 1182a0a10f218..fa391fa6cc7a7 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -862,8 +862,8 @@ func.func @test_store_box(%array : !fir.ref>>, %box // CHECK-LABEL: llvm.func @test_store_box // CHECK-SAME: (%[[arg0:.*]]: !llvm.ptr, // CHECK-SAME: %[[arg1:.*]]: !llvm.ptr) { -// CHECK-NEXT: %[[box_to_store:.*]] = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)> -// CHECK-NEXT: llvm.store %[[box_to_store]], %[[arg0]] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i{{.*}}>>)>, !llvm.ptr +// CHECK-NEXT: %[[size:.*]] = llvm.mlir.constant(72 : i32) : i32 +// CHECK-NEXT: "llvm.intr.memcpy"(%[[arg0]], %[[arg1]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () // CHECK-NEXT: llvm.return // CHECK-NEXT: } @@ -875,15 +875,17 @@ func.func @store_unlimited_polymorphic_box(%arg0 : !fir.class, %arg1 : !fi fir.store %arg3 to %arg3r : !fir.ref>> return } -// CHECK-LABEL: llvm.func @store_unlimited_polymorphic_box( -// CHECK: %[[VAL_8:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> -// CHECK: llvm.store %[[VAL_8]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr -// CHECK: %[[VAL_9:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 
x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)> -// CHECK: llvm.store %[[VAL_9]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr -// CHECK: %[[VAL_10:.*]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)> -// CHECK: llvm.store %[[VAL_10]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, ptr, array<1 x i{{.*}}>)>, !llvm.ptr -// CHECK: %[[VAL_11:.*]] = llvm.load %{{.*}}: !llvm.ptr -// CHECK: llvm.store %[[VAL_11]], %{{.*}} : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i{{.*}}>>, ptr, array<1 x i{{.*}}>)>, !llvm.ptr +// CHECK: llvm.func @store_unlimited_polymorphic_box(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: !llvm.ptr, %[[VAL_2:.*]]: !llvm.ptr, %[[VAL_3:.*]]: !llvm.ptr, %[[VAL_4:.*]]: !llvm.ptr, %[[VAL_5:.*]]: !llvm.ptr, %[[VAL_6:.*]]: !llvm.ptr, %[[VAL_7:.*]]: !llvm.ptr) { +// CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(40 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_4]], %[[VAL_0]], %[[VAL_8]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(64 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_5]], %[[VAL_1]], %[[VAL_9]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: %[[VAL_10:.*]] = llvm.mlir.constant(40 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_6]], %[[VAL_2]], %[[VAL_10]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(64 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_7]], %[[VAL_3]], %[[VAL_11]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: llvm.return +// CHECK: } // ----- @@ -935,8 +937,8 @@ func.func @test_load_box(%addr : !fir.ref>>) { // GENERIC-NEXT: %[[box_copy:.*]] = llvm.alloca %[[c1]] x 
!llvm.struct<([[DESC_TYPE:.*]])> // AMDGPU-NEXT: %[[alloca_box_copy:.*]] = llvm.alloca %[[c1]] x !llvm.struct<([[DESC_TYPE:.*]])>{{.*}} : (i32) -> !llvm.ptr<5> // AMDGPU-NEXT: %[[box_copy:.*]] = llvm.addrspacecast %[[alloca_box_copy]] : !llvm.ptr<5> to !llvm.ptr -// CHECK-NEXT: %[[box_val:.*]] = llvm.load %[[arg0]] : !llvm.ptr -> !llvm.struct<([[DESC_TYPE]])> -// CHECK-NEXT: llvm.store %[[box_val]], %[[box_copy]] : !llvm.struct<([[DESC_TYPE]])>, !llvm.ptr +// CHECK-NEXT: %[[size:.*]] = llvm.mlir.constant(48 : i32) : i32 +// CHECK-NEXT: "llvm.intr.memcpy"(%[[box_copy]], %[[arg0]], %[[size]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () // CHECK-NEXT: llvm.call @takes_box(%[[box_copy]]) : (!llvm.ptr) -> () // CHECK-NEXT: llvm.return // CHECK-NEXT: } diff --git a/flang/test/Fir/embox-char.fir b/flang/test/Fir/embox-char.fir index bf8344dbb60fc..efb069f96520d 100644 --- a/flang/test/Fir/embox-char.fir +++ b/flang/test/Fir/embox-char.fir @@ -1,3 +1,10 @@ +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py + +// The script is designed to make adding checks to +// a test case fast, it is *not* designed to be authoritative +// about what constitutes a good test! The CHECK should be +// minimized and named to reflect the test intent. + // Test that the offset of the first element of the slice // is computed in elements of the type used for the GEP // computing the base of the slice. 
@@ -10,42 +17,40 @@ // print *, x(2,:) // end subroutine -// CHECK-LABEL: llvm.func @test_char4( -// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr, -// CHECK-SAME: %[[VAL_1_SLICE_LB0:.*]]: i64, %[[VAL_2_SLICE_EX0:.*]]: i64, %[[VAL_3_SLICE_ST0:.*]]: i64, %[[VAL_4_SLICE_LB1:.*]]: i64, %[[VAL_5_SLICE_EX1:.*]]: i64, %[[VAL_6_SLICE_ST1:.*]]: i64) { +// CHECK: llvm.func @test_char4(%[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[VAL_5:.*]]: i64, %[[VAL_6:.*]]: i64) { // CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_8:.*]] = llvm.alloca %[[VAL_7]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_10:.*]] = llvm.alloca %[[VAL_9]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(0 : index) : i64 // CHECK: %[[VAL_12:.*]] = llvm.mlir.constant(1 : index) : i64 -// CHECK: %[[VAL_13_WIDTH:.*]] = llvm.mlir.constant(4 : index) : i64 -// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_0]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_14]], %[[VAL_10]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(72 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_10]], %[[VAL_0]], %[[VAL_14]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () // CHECK: %[[VAL_15:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_16_BYTESIZE:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr 
-> i64 // CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_18_LB1:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_18:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64 // CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_20_EX1:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_20:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 // CHECK: %[[VAL_21:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_22_ST1:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_22:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 // CHECK: %[[VAL_23:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_24_BASEPTR:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr +// CHECK: %[[VAL_24:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr // CHECK: %[[VAL_25:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_26_LB0:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_26:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64 // CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_28_EX0:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_28:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 // CHECK: %[[VAL_29:.*]] = 
llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_30_ST0:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_31_LEN:.*]] = llvm.sdiv %[[VAL_16_BYTESIZE]], %[[VAL_13_WIDTH]] : i64 +// CHECK: %[[VAL_30:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_31:.*]] = llvm.sdiv %[[VAL_16]], %[[VAL_13]] : i64 // CHECK: %[[VAL_32:.*]] = llvm.mlir.constant(44 : i32) : i32 // CHECK: %[[VAL_33:.*]] = llvm.mlir.zero : !llvm.ptr // CHECK: %[[VAL_34:.*]] = llvm.getelementptr %[[VAL_33]][1] : (!llvm.ptr) -> !llvm.ptr, i32 // CHECK: %[[VAL_35:.*]] = llvm.ptrtoint %[[VAL_34]] : !llvm.ptr to i64 -// CHECK: %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31_LEN]] : i64 +// CHECK: %[[VAL_36:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31]] : i64 // CHECK: %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32 // CHECK: %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32 @@ -59,39 +64,39 @@ // CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_50:.*]] = llvm.trunc %[[VAL_49]] : i32 to i8 // CHECK: %[[VAL_51:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_48]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_52_c0:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VAL_52:.*]] = llvm.mlir.constant(0 : i64) : 
i64 // CHECK: %[[VAL_53:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1_SLICE_LB0]], %[[VAL_26_LB0]] : i64 -// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_31_LEN]] : i64 -// CHECK: %[[VAL_56_SLICE_OFF0:.*]] = llvm.add %[[VAL_55]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2_SLICE_EX0]], %[[VAL_1_SLICE_LB0]] : i64 -// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], %[[VAL_3_SLICE_ST0]] : i64 -// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3_SLICE_ST0]] : i64 -// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52_c0]] : i1, i64 +// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_31]] : i64 +// CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[VAL_52]] : i64 +// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52]] : i64 +// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52]] : i1, i64 // CHECK: %[[VAL_62:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_51]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_62]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_3_SLICE_ST0]] : i64 +// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36]], %[[VAL_3]] : i64 // CHECK: %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_28_EX0]] : i64 -// CHECK: %[[VAL_67:.*]] = 
llvm.mul %[[VAL_31_LEN]], %[[VAL_28_EX0]] : i64 -// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4_SLICE_LB1]], %[[VAL_18_LB1]] : i64 -// CHECK: %[[VAL_69_SLICE_OFF1:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 -// CHECK: %[[VAL_70_OFFSET:.*]] = llvm.add %[[VAL_69_SLICE_OFF1]], %[[VAL_56_SLICE_OFF0]] : i64 -// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5_SLICE_EX1]], %[[VAL_4_SLICE_LB1]] : i64 -// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6_SLICE_ST1]] : i64 -// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6_SLICE_ST1]] : i64 -// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" %[[VAL_73]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52_c0]] : i1, i64 +// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36]], %[[VAL_28]] : i64 +// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_31]], %[[VAL_28]] : i64 +// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_69:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 +// CHECK: %[[VAL_70:.*]] = llvm.add %[[VAL_69]], %[[VAL_56]] : i64 +// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" %[[VAL_73]], %[[VAL_52]] : i64 +// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52]] : i1, i64 // CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_65]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> // CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_75]], %[[VAL_76]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_66]], %[[VAL_6_SLICE_ST1]] : i64 +// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_66]], %[[VAL_6]] : i64 // CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_78]], %[[VAL_77]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, 
array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20_EX1]] : i64 -// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20_EX1]] : i64 -// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24_BASEPTR]]{{\[}}%[[VAL_70_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 -// CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_84]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20]] : i64 +// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20]] : i64 +// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24]]{{\[}}%[[VAL_70]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 +// CHECK: %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_83]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr // CHECK: llvm.return // CHECK: } func.func @test_char4(%arg0: !fir.ref>>>>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) { @@ -108,86 +113,84 @@ func.func @test_char4(%arg0: !fir.ref>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_9:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_10:.*]] = llvm.alloca %[[VAL_9]] x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr // CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(0 : index) : i64 -// CHECK: %[[VAL_12_c1:.*]] = llvm.mlir.constant(1 : index) : i64 -// CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_0]] : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_14]], %[[VAL_10]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x 
i64>>)>, !llvm.ptr -// CHECK: %[[VAL_15:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_16_BYTESIZE:.*]] = llvm.load %[[VAL_15]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_18_LB1:.*]] = llvm.load %[[VAL_17]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_19:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_20_EX1:.*]] = llvm.load %[[VAL_19]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_21:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_22_ST1:.*]] = llvm.load %[[VAL_21]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_23:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_24_BASEPTR:.*]] = llvm.load %[[VAL_23]] : !llvm.ptr -> !llvm.ptr -// CHECK: %[[VAL_25:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_26_LB0:.*]] = llvm.load %[[VAL_25]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_27:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_28_EX0:.*]] = llvm.load %[[VAL_27]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_29:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> 
-// CHECK: %[[VAL_30_ST0:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_32:.*]] = llvm.mlir.constant(40 : i32) : i32 -// CHECK: %[[VAL_33:.*]] = llvm.mlir.zero : !llvm.ptr -// CHECK: %[[VAL_34:.*]] = llvm.getelementptr %[[VAL_33]][1] : (!llvm.ptr) -> !llvm.ptr, i8 -// CHECK: %[[VAL_35:.*]] = llvm.ptrtoint %[[VAL_34]] : !llvm.ptr to i64 -// CHECK: %[[VAL_36_BYTESIZE:.*]] = llvm.mul %[[VAL_35]], %[[VAL_16_BYTESIZE]] : i64 -// CHECK: %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36_BYTESIZE]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(20240719 : i32) : i32 -// CHECK: %[[VAL_40:.*]] = llvm.insertvalue %[[VAL_39]], %[[VAL_38]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_41:.*]] = llvm.mlir.constant(2 : i32) : i32 -// CHECK: %[[VAL_42:.*]] = llvm.trunc %[[VAL_41]] : i32 to i8 -// CHECK: %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_40]][3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_44:.*]] = llvm.trunc %[[VAL_32]] : i32 to i8 -// CHECK: %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_46:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_47:.*]] = llvm.trunc %[[VAL_46]] : i32 to i8 -// CHECK: %[[VAL_48:.*]] = llvm.insertvalue %[[VAL_47]], %[[VAL_45]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_49:.*]] = llvm.mlir.constant(0 : i32) : i32 -// CHECK: %[[VAL_50:.*]] = llvm.trunc %[[VAL_49]] : i32 to i8 -// CHECK: %[[VAL_51:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_48]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_52_c0:.*]] = 
llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_53:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_54:.*]] = llvm.sub %[[VAL_1_SLICE_LB0]], %[[VAL_26_LB0]] : i64 -// CHECK: %[[VAL_55:.*]] = llvm.mul %[[VAL_54]], %[[VAL_16_BYTESIZE]] : i64 -// CHECK: %[[VAL_56_SLICE_OFF0:.*]] = llvm.add %[[VAL_55]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_57:.*]] = llvm.sub %[[VAL_2_SLICE_EX0]], %[[VAL_1_SLICE_LB0]] : i64 -// CHECK: %[[VAL_58:.*]] = llvm.add %[[VAL_57]], %[[VAL_3_SLICE_ST0]] : i64 -// CHECK: %[[VAL_59:.*]] = llvm.sdiv %[[VAL_58]], %[[VAL_3_SLICE_ST0]] : i64 -// CHECK: %[[VAL_60:.*]] = llvm.icmp "sgt" %[[VAL_59]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_61:.*]] = llvm.select %[[VAL_60]], %[[VAL_59]], %[[VAL_52_c0]] : i1, i64 -// CHECK: %[[VAL_62:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_51]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_61]], %[[VAL_62]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_3_SLICE_ST0]] : i64 -// CHECK: %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_66:.*]] = llvm.mul %[[VAL_36_BYTESIZE]], %[[VAL_28_EX0]] : i64 -// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_16_BYTESIZE]], %[[VAL_28_EX0]] : i64 -// CHECK: %[[VAL_68:.*]] = llvm.sub %[[VAL_4_SLICE_LB1]], %[[VAL_18_LB1]] : i64 -// CHECK: %[[VAL_69_SLICE_OFF1:.*]] = llvm.mul %[[VAL_68]], %[[VAL_67]] : i64 -// CHECK: %[[VAL_70_OFFSET:.*]] = llvm.add %[[VAL_69_SLICE_OFF1]], %[[VAL_56_SLICE_OFF0]] : i64 -// CHECK: %[[VAL_71:.*]] = llvm.sub %[[VAL_5_SLICE_EX1]], %[[VAL_4_SLICE_LB1]] : i64 -// CHECK: %[[VAL_72:.*]] = llvm.add %[[VAL_71]], %[[VAL_6_SLICE_ST1]] : i64 -// CHECK: %[[VAL_73:.*]] = llvm.sdiv %[[VAL_72]], %[[VAL_6_SLICE_ST1]] : i64 -// CHECK: %[[VAL_74:.*]] = llvm.icmp "sgt" 
%[[VAL_73]], %[[VAL_52_c0]] : i64 -// CHECK: %[[VAL_75:.*]] = llvm.select %[[VAL_74]], %[[VAL_73]], %[[VAL_52_c0]] : i1, i64 -// CHECK: %[[VAL_76:.*]] = llvm.insertvalue %[[VAL_53]], %[[VAL_65]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_75]], %[[VAL_76]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_66]], %[[VAL_6_SLICE_ST1]] : i64 -// CHECK: %[[VAL_79:.*]] = llvm.insertvalue %[[VAL_78]], %[[VAL_77]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: %[[VAL_80:.*]] = llvm.mul %[[VAL_66]], %[[VAL_20_EX1]] : i64 -// CHECK: %[[VAL_81:.*]] = llvm.mul %[[VAL_67]], %[[VAL_20_EX1]] : i64 -// CHECK: %[[VAL_82:.*]] = llvm.getelementptr %[[VAL_24_BASEPTR]]{{\[}}%[[VAL_70_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 -// CHECK: %[[VAL_84:.*]] = llvm.insertvalue %[[VAL_82]], %[[VAL_79]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> -// CHECK: llvm.store %[[VAL_84]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr +// CHECK: %[[VAL_12:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[VAL_13:.*]] = llvm.mlir.constant(72 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_10]], %[[VAL_0]], %[[VAL_13]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () +// CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_10]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_15:.*]] = llvm.load %[[VAL_14]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_17:.*]] = llvm.load %[[VAL_16]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_18:.*]] = 
llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_19:.*]] = llvm.load %[[VAL_18]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_20:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_12]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_21:.*]] = llvm.load %[[VAL_20]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_22:.*]] = llvm.getelementptr %[[VAL_10]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_23:.*]] = llvm.load %[[VAL_22]] : !llvm.ptr -> !llvm.ptr +// CHECK: %[[VAL_24:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_25:.*]] = llvm.load %[[VAL_24]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_26:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_27:.*]] = llvm.load %[[VAL_26]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_28:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_29:.*]] = llvm.load %[[VAL_28]] : !llvm.ptr -> i64 +// CHECK: %[[VAL_30:.*]] = llvm.mlir.constant(40 : i32) : i32 +// CHECK: %[[VAL_31:.*]] = llvm.mlir.zero : !llvm.ptr +// CHECK: %[[VAL_32:.*]] = llvm.getelementptr %[[VAL_31]][1] : (!llvm.ptr) -> !llvm.ptr, i8 +// CHECK: %[[VAL_33:.*]] = llvm.ptrtoint %[[VAL_32]] : !llvm.ptr to i64 +// CHECK: %[[VAL_34:.*]] = llvm.mul %[[VAL_33]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_35:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_36:.*]] = 
llvm.insertvalue %[[VAL_34]], %[[VAL_35]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_37:.*]] = llvm.mlir.constant(20240719 : i32) : i32 +// CHECK: %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_37]], %[[VAL_36]][2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_39:.*]] = llvm.mlir.constant(2 : i32) : i32 +// CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i32 to i8 +// CHECK: %[[VAL_41:.*]] = llvm.insertvalue %[[VAL_40]], %[[VAL_38]][3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_42:.*]] = llvm.trunc %[[VAL_30]] : i32 to i8 +// CHECK: %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_42]], %[[VAL_41]][4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_44:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_45:.*]] = llvm.trunc %[[VAL_44]] : i32 to i8 +// CHECK: %[[VAL_46:.*]] = llvm.insertvalue %[[VAL_45]], %[[VAL_43]][5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_47:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[VAL_48:.*]] = llvm.trunc %[[VAL_47]] : i32 to i8 +// CHECK: %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_46]][6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_50:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VAL_51:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[VAL_52:.*]] = llvm.sub %[[VAL_1]], %[[VAL_25]] : i64 +// CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_52]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[VAL_50]] : i64 +// CHECK: %[[VAL_55:.*]] = llvm.sub %[[VAL_2]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_57:.*]] = llvm.sdiv %[[VAL_56]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_58:.*]] = llvm.icmp "sgt" %[[VAL_57]], %[[VAL_50]] : i64 +// CHECK: %[[VAL_59:.*]] = 
llvm.select %[[VAL_58]], %[[VAL_57]], %[[VAL_50]] : i1, i64 +// CHECK: %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_49]][7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_59]], %[[VAL_60]][7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_62:.*]] = llvm.mul %[[VAL_34]], %[[VAL_3]] : i64 +// CHECK: %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_64:.*]] = llvm.mul %[[VAL_34]], %[[VAL_27]] : i64 +// CHECK: %[[VAL_65:.*]] = llvm.mul %[[VAL_15]], %[[VAL_27]] : i64 +// CHECK: %[[VAL_66:.*]] = llvm.sub %[[VAL_4]], %[[VAL_17]] : i64 +// CHECK: %[[VAL_67:.*]] = llvm.mul %[[VAL_66]], %[[VAL_65]] : i64 +// CHECK: %[[VAL_68:.*]] = llvm.add %[[VAL_67]], %[[VAL_54]] : i64 +// CHECK: %[[VAL_69:.*]] = llvm.sub %[[VAL_5]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_70:.*]] = llvm.add %[[VAL_69]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_71:.*]] = llvm.sdiv %[[VAL_70]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_72:.*]] = llvm.icmp "sgt" %[[VAL_71]], %[[VAL_50]] : i64 +// CHECK: %[[VAL_73:.*]] = llvm.select %[[VAL_72]], %[[VAL_71]], %[[VAL_50]] : i1, i64 +// CHECK: %[[VAL_74:.*]] = llvm.insertvalue %[[VAL_51]], %[[VAL_63]][7, 1, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_73]], %[[VAL_74]][7, 1, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_76:.*]] = llvm.mul %[[VAL_64]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][7, 1, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: %[[VAL_78:.*]] = llvm.mul %[[VAL_64]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_79:.*]] = llvm.mul %[[VAL_65]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_80:.*]] = 
llvm.getelementptr %[[VAL_23]]{{\[}}%[[VAL_68]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 +// CHECK: %[[VAL_81:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_77]][0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)> +// CHECK: llvm.store %[[VAL_81]], %[[VAL_8]] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>, !llvm.ptr // CHECK: llvm.return // CHECK: } func.func @test_char1(%arg0: !fir.ref>>>>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) { diff --git a/flang/test/Fir/polymorphic.fir b/flang/test/Fir/polymorphic.fir index 40204314e8df7..78e5b8dcf84c7 100644 --- a/flang/test/Fir/polymorphic.fir +++ b/flang/test/Fir/polymorphic.fir @@ -14,8 +14,7 @@ func.func @_QMpolymorphic_testPtest_allocate_unlimited_polymorphic_non_derived() // CHECK: %[[MEM:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 0, i32 20240719, i8 0, i8 -1, i8 1, i8 1, ptr null, [1 x i64] zeroinitializer }, ptr %[[MEM]] -// CHECK: %[[LOADED:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[MEM]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOADED]], ptr %[[DESC]] +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[DESC]], ptr %[[MEM]], i32 40, i1 false) // CHECK: ret void // CHECK: } @@ -66,8 +65,7 @@ func.func @_QMpolymorphic_testPtest_embox() { // CHECK-LABEL: @_QMpolymorphic_testPtest_embox() // CHECK: %[[ALLOCA_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } { ptr @_QFEy, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, {{.*}}, ptr %[[ALLOCA_DESC]] -// CHECK: %[[LOADED_DESC:.*]] = load { ptr, i64, i32, i8, i8, i8, 
i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[ALLOCA_DESC]], align 8 -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[LOADED_DESC]], ptr @_QFEx, align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr @_QFEx, ptr %[[ALLOCA_DESC]], i32 64, i1 false) // Test emboxing of an array element from an unlimited polymorphic array. @@ -158,8 +156,7 @@ func.func @_QQmain() { // CHECK: %[[CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } // CHECK: %[[DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1 // CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr @_QMmod1Ea, i64 ptrtoint (ptr getelementptr (%_QMmod1TtK2, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 1, i8 1, ptr @_QMmod1EXdtXtX2, [1 x i64] zeroinitializer }, ptr %[[CLASS_NONE]], align 8 -// CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[CLASS_NONE]] -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[DESC]] +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[DESC]], ptr %[[CLASS_NONE]], i32 40, i1 false) // CHECK: call void @_QMmod1Psub1(ptr %[[DESC]]) fir.global @_QMmod2Ep : !fir.class> { @@ -180,8 +177,7 @@ func.func private @_FortranAPointerAssociate(!fir.ref>, !fir.box< // CHECK-LABEL: define void @_QMmod2Pinitp( // CHECK-SAME: ptr %[[ARG0:.*]]){{.*}}{ // CHECK: %[[ALLOCA_CLASS_NONE:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } -// CHECK: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ARG0]] -// CHECK: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA_CLASS_NONE]] +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA_CLASS_NONE]], ptr %[[ARG0]], i32 40, i1 false) // CHECK: %{{.*}} = call {} @_FortranAPointerAssociate(ptr @_QMmod2Ep, ptr %[[ALLOCA_CLASS_NONE]]) // CHECK: ret void diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 
809ab3a922a0f..401ebbc8c49fe 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -137,8 +137,8 @@ module { // CHECK: %[[VAL_7:.*]] = llvm.mlir.addressof @_QFEx : !llvm.ptr // CHECK: %[[VAL_8:.*]] = llvm.mlir.addressof @_QQclX2E2F64756D6D792E66393000 : !llvm.ptr // CHECK: %[[VAL_10:.*]] = llvm.call @_FortranAioBeginExternalListOutput(%[[VAL_6]], %[[VAL_8]], %[[VAL_5]]) {fastmathFlags = #llvm.fastmath} : (i32, !llvm.ptr, i32) -> !llvm.ptr -// CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_7]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> -// CHECK: llvm.store %[[VAL_11]], %[[VAL_3]] {tbaa = [#[[$BOXT]]]} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>, !llvm.ptr +// CHECK: %[[VAL_11:.*]] = llvm.mlir.constant(64 : i32) : i32 +// CHECK: "llvm.intr.memcpy"(%[[VAL_3]], %[[VAL_7]], %[[VAL_11]]) <{isVolatile = false, tbaa = [#[[$BOXT]]]}> // CHECK: %[[VAL_12:.*]] = llvm.getelementptr %[[VAL_3]][0, 7, %[[VAL_4]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_13:.*]] = llvm.load %[[VAL_12]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i64 // CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_3]][0, 7, %[[VAL_4]], 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> diff --git a/flang/test/Fir/tco-default-datalayout.fir b/flang/test/Fir/tco-default-datalayout.fir index 0741e820a8d19..c6a4ddb46853f 100644 --- a/flang/test/Fir/tco-default-datalayout.fir +++ b/flang/test/Fir/tco-default-datalayout.fir @@ -7,6 +7,6 @@ module { // CHECK: module attributes { // CHECK-SAME: dlti.dl_spec = #dlti.dl_spec< // ... -// CHECK-SAME: #dlti.dl_entry : vector<2xi64>>, +// CHECK-SAME: i64 = dense<[32, 64]> : vector<2xi64>, // ... 
// CHECK-SAME: llvm.data_layout = "" diff --git a/flang/test/Fir/tco-explicit-datalayout.fir b/flang/test/Fir/tco-explicit-datalayout.fir index 50d8d835a602f..cae500a948aa5 100644 --- a/flang/test/Fir/tco-explicit-datalayout.fir +++ b/flang/test/Fir/tco-explicit-datalayout.fir @@ -8,6 +8,6 @@ module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i6 // CHECK: module attributes { // CHECK-SAME: dlti.dl_spec = #dlti.dl_spec< // ... -// CHECK-SAME: #dlti.dl_entry : vector<2xi64>>, +// CHECK-SAME: i64 = dense<128> : vector<2xi64>, // ... // CHECK-SAME: llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:128-i128:128-f80:128-n8:16:32:64-S128" diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 index 62d0a3faf0c59..63ac6fbe05ee0 100644 --- a/flang/test/Integration/OpenMP/private-global.f90 +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -31,8 +31,9 @@ program bug ! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 ! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 ! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 -! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 +! CHECK : %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 +! 
CHECK : store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 +! CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[TABLE_BOX_ADDR2]], ptr %[[TABLE_BOX_ADDR]], i32 48, i1 false) ! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) ! ... ! check that we use the private copy of table for table/=50 diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 index 9c97c689dad70..b3a668018df1d 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 @@ -57,5 +57,4 @@ end program compilation_to_obj ! LLVM: @[[GLOB_VAR:[^[:space:]]+]]t = internal global ! LLVM: define internal void @_QQmain..omp_par -! LLVM: %[[GLOB_VAL:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr @[[GLOB_VAR]]t, align 8 -! LLVM-NEXT: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[GLOB_VAL]], ptr %{{.*}}, align 8 +! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %{{.+}}, ptr @[[GLOB_VAR]]t, i32 48, i1 false) diff --git a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 index 262075ec9b25d..8e6f55abd5671 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-mixed.f90 @@ -17,7 +17,7 @@ subroutine proc end subroutine proc !CHECK-LABEL: define void @proc_() -!CHECK: call void +!CHECK: call void (ptr, i32, ptr, ...) 
!CHECK-SAME: @__kmpc_fork_call(ptr {{.*}}, i32 1, ptr @[[OMP_PAR:.*]], {{.*}}) !CHECK: define internal void @[[OMP_PAR]](ptr {{.*}} %[[TID_ADDR:.*]], ptr noalias diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90 index e23e38ffb4b01..4d70e1ea4c739 100644 --- a/flang/test/Lower/allocatable-polymorphic.f90 +++ b/flang/test/Lower/allocatable-polymorphic.f90 @@ -603,10 +603,9 @@ program test_alloc ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) -! LLVM-COUNT-2: call void %{{.*}}() +! LLVM-COUNT-2: call void %{{[0-9]*}}() -! LLVM: %[[C1_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}} -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[C1_LOAD]], ptr %{{.*}} +! LLVM: call void @llvm.memcpy.p0.p0.i32 ! LLVM: %[[GEP_TDESC_C1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C1:.*]] = load ptr, ptr %[[GEP_TDESC_C1]] ! LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -620,8 +619,7 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %{{.*}}, ptr %[[TMP:.*]] ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: %[[LOAD_C2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}} -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD_C2]], ptr %{{.*}} +! LLVM: call void @llvm.memcpy.p0.p0.i32 ! LLVM: %[[GEP_TDESC_C2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 7 ! LLVM: %[[TDESC_C2:.*]] = load ptr, ptr %[[GEP_TDESC_C2]] ! 
LLVM: %[[ELEM_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -635,9 +633,7 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %{{.*}}, ptr %{{.*}} ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: %[[C3_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}} -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[C3_LOAD]], ptr %{{.*}} - +! LLVM: call void @llvm.memcpy.p0.p0.i32 ! LLVM: %[[GEP_TDESC_C3:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 8 ! LLVM: %[[TDESC_C3:.*]] = load ptr, ptr %[[GEP_TDESC_C3]] ! LLVM: %[[ELE_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -658,8 +654,7 @@ program test_alloc ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[BOX7]], ptr %{{.*}} ! LLVM: call void %{{.*}}(ptr %{{.*}}) -! LLVM: %[[C4_LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}} -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] } %[[C4_LOAD]], ptr %{{.*}} +! LLVM: call void @llvm.memcpy.p0.p0.i32 ! LLVM: %[[GEP_TDESC_C4:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 8 ! LLVM: %[[TDESC_C4:.*]] = load ptr, ptr %[[GEP_TDESC_C4]] ! LLVM: %[[ELE_SIZE_GEP:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %{{.*}}, i32 0, i32 1 @@ -686,8 +681,7 @@ program test_alloc ! LLVM-LABEL: define void @_QMpolyPtest_deallocate() ! 
LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 ptrtoint (ptr getelementptr (%_QMpolyTp1, ptr null, i32 1) to i64), i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]] -! LLVM: %[[LOAD:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, ptr %[[ALLOCA1]] -! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } %[[LOAD]], ptr %[[ALLOCA2:[0-9]*]] +! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false) ! LLVM: %{{.*}} = call {} @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}}) diff --git a/flang/test/Lower/do_concurrent.f90 b/flang/test/Lower/do_concurrent.f90 new file mode 100644 index 0000000000000..ef93d2d6b035b --- /dev/null +++ b/flang/test/Lower/do_concurrent.f90 @@ -0,0 +1,102 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +! Simple tests for structured concurrent loops with loop-control. 
+ +pure function bar(n, m) + implicit none + integer, intent(in) :: n, m + integer :: bar + bar = n + m +end function + +!CHECK-LABEL: sub1 +subroutine sub1(n) + implicit none + integer :: n, m, i, j, k + integer, dimension(n) :: a +!CHECK: %[[LB1:.*]] = arith.constant 1 : i32 +!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index +!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref +!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index + +!CHECK: %[[LB2:.*]] = arith.constant 1 : i32 +!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index +!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs fastmath : (!fir.ref, !fir.ref) -> i32 +!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index + +!CHECK: %[[LB3:.*]] = arith.constant 5 : i32 +!CHECK: %[[LB3_CVT:.*]] = fir.convert %[[LB3]] : (i32) -> index +!CHECK: %[[UB3:.*]] = arith.constant 10 : i32 +!CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index + +!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered +!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered +!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered + + do concurrent(i=1:n, j=1:bar(n*m, n/m), k=5:10) + a(i) = n + end do +end subroutine + +!CHECK-LABEL: sub2 +subroutine sub2(n) + implicit none + integer :: n, m, i, j + integer, dimension(n) :: a +!CHECK: %[[LB1:.*]] = arith.constant 1 : i32 +!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index +!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref +!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index +!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered +!CHECK: %[[LB2:.*]] = arith.constant 1 : i32 +!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index +!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs fastmath : (!fir.ref, !fir.ref) -> i32 +!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : 
(i32) -> index +!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered + do concurrent(i=1:n) + do concurrent(j=1:bar(n*m, n/m)) + a(i) = n + end do + end do +end subroutine + + +!CHECK-LABEL: unstructured +subroutine unstructured(inner_step) + integer(4) :: i, j, inner_step + +!CHECK-NOT: cf.br +!CHECK-NOT: cf.cond_br +!CHECK: %[[LB1:.*]] = arith.constant 1 : i32 +!CHECK: %[[LB1_CVT:.*]] = fir.convert %c1_i32 : (i32) -> i16 +!CHECK: %[[UB1:.*]] = arith.constant 5 : i32 +!CHECK: %[[UB1_CVT:.*]] = fir.convert %c5_i32 : (i32) -> i16 +!CHECK: %[[STP1:.*]] = arith.constant 1 : i16 + +!CHECK-NOT: cf.br +!CHECK-NOT: cf.cond_br +!CHECK: %[[LB2:.*]] = arith.constant 3 : i32 +!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> i16 +!CHECK: %[[UB2:.*]] = arith.constant 9 : i32 +!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> i16 +!CHECK: %[[STP2:.*]] = fir.load %{{.*}}#0 : !fir.ref +!CHECK: %[[STP2_CVT:.*]] = fir.convert %[[STP2]] : (i32) -> i16 +!CHECK: fir.store %[[STP2_CVT]] to %{{.*}} : !fir.ref +!CHECK: cf.br ^[[I_LOOP_HEADER:.*]] + +!CHECK: ^[[I_LOOP_HEADER]]: +!CHECK-NEXT: %{{.*}} = fir.load %{{.*}} : !fir.ref +!CHECK-NEXT: %{{.*}} = arith.constant 0 : i16 +!CHECK-NEXT: %{{.*}} = arith.cmpi sgt, %{{.*}}, %{{.*}}: i16 +!CHECK-NEXT: cf.cond_br %{{.*}}, ^[[J_LOOP_HEADER:.*]], ^{{.*}} + +!CHECK: ^[[J_LOOP_HEADER]]: +!CHECK-NEXT: %[[RANGE:.*]] = arith.subi %[[UB2_CVT]], %[[LB2_CVT]] : i16 +!CHECK-NEXT: %{{.*}} = arith.addi %[[RANGE]], %[[STP2_CVT]] : i16 +!CHECK-NEXT: %{{.*}} = arith.divsi %{{.*}}, %[[STP2_CVT]] : i16 + do concurrent (integer(2)::i=1:5, j=3:9:inner_step, i.ne.3) + goto (7, 7) i+1 + print*, 'E:', i, j + 7 continue + enddo +end subroutine unstructured diff --git a/flang/tools/CMakeLists.txt b/flang/tools/CMakeLists.txt index 337545ae0d4d7..1d2d2c608faf9 100644 --- a/flang/tools/CMakeLists.txt +++ b/flang/tools/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(flang-driver) add_subdirectory(tco) 
add_subdirectory(f18-parse-demo) add_subdirectory(fir-opt) +add_subdirectory(fir-lsp-server) diff --git a/flang/tools/fir-lsp-server/CMakeLists.txt b/flang/tools/fir-lsp-server/CMakeLists.txt new file mode 100644 index 0000000000000..ff0ced6693b97 --- /dev/null +++ b/flang/tools/fir-lsp-server/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + AsmParser + ) + +add_flang_tool(fir-lsp-server fir-lsp-server.cpp) + +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) +target_link_libraries(fir-lsp-server PRIVATE + CUFDialect + FIRDialect + HLFIRDialect + MLIRLspServerLib + ${dialect_libs} + ${extension_libs}) diff --git a/flang/tools/fir-lsp-server/fir-lsp-server.cpp b/flang/tools/fir-lsp-server/fir-lsp-server.cpp new file mode 100644 index 0000000000000..8b724e292b5ab --- /dev/null +++ b/flang/tools/fir-lsp-server/fir-lsp-server.cpp @@ -0,0 +1,9 @@ +#include "mlir/Tools/mlir-lsp-server/MlirLspServerMain.h" +#include "flang/Optimizer/Support/InitFIR.h" + +int main(int argc, char **argv) { + mlir::DialectRegistry registry; + fir::support::registerNonCodegenDialects(registry); + fir::support::addFIRExtensions(registry); + return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry)); +} diff --git a/libc/hdr/fenv_macros.h b/libc/hdr/fenv_macros.h index a2e4462ef02dc..3f0bd89a6ea35 100644 --- a/libc/hdr/fenv_macros.h +++ b/libc/hdr/fenv_macros.h @@ -19,7 +19,6 @@ // In some environment, FE_ALL_EXCEPT is set to 0 and the remaining exceptions // FE_* are missing. -#if (FE_ALL_EXCEPT == 0) #ifndef FE_DIVBYZERO #define FE_DIVBYZERO 0 #endif // FE_DIVBYZERO @@ -39,12 +38,6 @@ #ifndef FE_UNDERFLOW #define FE_UNDERFLOW 0 #endif // FE_UNDERFLOW -#else -// If this is not provided by the system, define it for use internally. -#ifndef __FE_DENORM -#define __FE_DENORM (1 << 6) -#endif -#endif // Rounding mode macros might be missing. 
#ifndef FE_DOWNWARD diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index e45979857d795..5ad71e7a6ff46 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -51,7 +51,7 @@ add_proxy_header_library( HDRS mode_t.h DEPENDS - ../fcntl_overlay + libc.hdr.fcntl_overlay FULL_BUILD_DEPENDS libc.include.llvm-libc-types.mode_t libc.include.fcntl diff --git a/libc/include/llvm-libc-macros/linux/fcntl-macros.h b/libc/include/llvm-libc-macros/linux/fcntl-macros.h index 8ee95863728e1..aec8a0d2da0b5 100644 --- a/libc/include/llvm-libc-macros/linux/fcntl-macros.h +++ b/libc/include/llvm-libc-macros/linux/fcntl-macros.h @@ -88,6 +88,9 @@ // Close on succesful #define F_CLOEXEC 1 +// Close on execute for fcntl. +#define FD_CLOEXEC 1 + #define F_RDLCK 0 #define F_WRLCK 1 #define F_UNLCK 2 diff --git a/libc/include/llvm-libc-macros/linux/signal-macros.h b/libc/include/llvm-libc-macros/linux/signal-macros.h index e379fc41efd02..0b7317ebc9b80 100644 --- a/libc/include/llvm-libc-macros/linux/signal-macros.h +++ b/libc/include/llvm-libc-macros/linux/signal-macros.h @@ -76,15 +76,12 @@ #define SS_ONSTACK 0x1 #define SS_DISABLE 0x2 -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(__i386__) || defined(__riscv) #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 #elif defined(__aarch64__) #define MINSIGSTKSZ 5120 #define SIGSTKSZ 16384 -#elif defined(__riscv) -#define MINSIGSTKSZ 2048 -#define SIGSTKSZ 8192 #else #error "Signal stack sizes not defined for your platform." 
#endif diff --git a/libc/include/llvm-libc-types/fexcept_t.h b/libc/include/llvm-libc-types/fexcept_t.h index 60687bd1318aa..5aa09fbbaffc7 100644 --- a/libc/include/llvm-libc-types/fexcept_t.h +++ b/libc/include/llvm-libc-types/fexcept_t.h @@ -9,6 +9,10 @@ #ifndef LLVM_LIBC_TYPES_FEXCEPT_T_H #define LLVM_LIBC_TYPES_FEXCEPT_T_H -typedef int fexcept_t; +#if defined(__x86_64__) || defined(__i386__) +typedef unsigned short int fexcept_t; +#else +typedef unsigned int fexcept_t; +#endif #endif // LLVM_LIBC_TYPES_FEXCEPT_T_H diff --git a/libc/include/llvm-libc-types/jmp_buf.h b/libc/include/llvm-libc-types/jmp_buf.h index 60e033c6c65a9..f246e6491cf55 100644 --- a/libc/include/llvm-libc-types/jmp_buf.h +++ b/libc/include/llvm-libc-types/jmp_buf.h @@ -19,6 +19,13 @@ typedef struct { __UINT64_TYPE__ r15; __UINTPTR_TYPE__ rsp; __UINTPTR_TYPE__ rip; +#elif defined(__i386__) + long ebx; + long esi; + long edi; + long ebp; + long esp; + long eip; #elif defined(__riscv) /* Program counter. */ long int __pc; diff --git a/libc/src/__support/OSUtil/linux/i386/syscall.h b/libc/src/__support/OSUtil/linux/i386/syscall.h new file mode 100644 index 0000000000000..88d7f2fb2c49f --- /dev/null +++ b/libc/src/__support/OSUtil/linux/i386/syscall.h @@ -0,0 +1,88 @@ +//===---------- inline implementation of i386 syscalls ------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_I386_SYSCALL_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_I386_SYSCALL_H + +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LIBC_INLINE long syscall_impl(long num) { + long ret; + LIBC_INLINE_ASM("int $128" : "=a"(ret) : "a"(num) : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1) { + long ret; + LIBC_INLINE_ASM("int $128" : "=a"(ret) : "a"(num), "b"(arg1) : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1, long arg2) { + long ret; + LIBC_INLINE_ASM("int $128" + : "=a"(ret) + : "a"(num), "b"(arg1), "c"(arg2) + : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1, long arg2, long arg3) { + long ret; + LIBC_INLINE_ASM("int $128" + : "=a"(ret) + : "a"(num), "b"(arg1), "c"(arg2), "d"(arg3) + : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1, long arg2, long arg3, + long arg4) { + long ret; + LIBC_INLINE_ASM("int $128" + : "=a"(ret) + : "a"(num), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) + : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1, long arg2, long arg3, + long arg4, long arg5) { + long ret; + LIBC_INLINE_ASM("int $128" + : "=a"(ret) + : "a"(num), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4), + "D"(arg5) + : "memory"); + return ret; +} + +LIBC_INLINE long syscall_impl(long num, long arg1, long arg2, long arg3, + long arg4, long arg5, long arg6) { + long ret; + LIBC_INLINE_ASM(R"( + push %[arg6] + push %%ebp + mov 4(%%esp), %%ebp + int $128 + pop %%ebp + add $4, %%esp + )" + : "=a"(ret) + : "a"(num), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4), + "D"(arg5), [arg6] "m"(arg6) + : "memory"); + return ret; +} + +} // namespace LIBC_NAMESPACE_DECL + 
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_I386_SYSCALL_H diff --git a/libc/src/__support/OSUtil/linux/syscall.h b/libc/src/__support/OSUtil/linux/syscall.h index ad3f6947d0a06..24e0fca73c167 100644 --- a/libc/src/__support/OSUtil/linux/syscall.h +++ b/libc/src/__support/OSUtil/linux/syscall.h @@ -14,7 +14,9 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#ifdef LIBC_TARGET_ARCH_IS_X86_64 +#ifdef LIBC_TARGET_ARCH_IS_X86_32 +#include "i386/syscall.h" +#elif defined(LIBC_TARGET_ARCH_IS_X86_64) #include "x86_64/syscall.h" #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "aarch64/syscall.h" diff --git a/libc/src/setjmp/x86_64/longjmp.cpp b/libc/src/setjmp/x86_64/longjmp.cpp index c293c55a6f9fb..143c9deb11e9a 100644 --- a/libc/src/setjmp/x86_64/longjmp.cpp +++ b/libc/src/setjmp/x86_64/longjmp.cpp @@ -11,12 +11,34 @@ #include "src/__support/common.h" #include "src/__support/macros/config.h" -#if !defined(LIBC_TARGET_ARCH_IS_X86_64) +#if !defined(LIBC_TARGET_ARCH_IS_X86) #error "Invalid file include" #endif namespace LIBC_NAMESPACE_DECL { +#ifdef __i386__ +[[gnu::naked]] +LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf, int)) { + asm(R"( + mov 0x4(%%esp), %%ecx + mov 0x8(%%esp), %%eax + cmpl $0x1, %%eax + adcl $0x0, %%eax + + mov %c[ebx](%%ecx), %%ebx + mov %c[esi](%%ecx), %%esi + mov %c[edi](%%ecx), %%edi + mov %c[ebp](%%ecx), %%ebp + mov %c[esp](%%ecx), %%esp + + jmp *%c[eip](%%ecx) + )" ::[ebx] "i"(offsetof(__jmp_buf, ebx)), + [esi] "i"(offsetof(__jmp_buf, esi)), [edi] "i"(offsetof(__jmp_buf, edi)), + [ebp] "i"(offsetof(__jmp_buf, ebp)), [esp] "i"(offsetof(__jmp_buf, esp)), + [eip] "i"(offsetof(__jmp_buf, eip))); +} +#else [[gnu::naked]] LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf, int)) { asm(R"( @@ -38,5 +60,6 @@ LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf, int)) { [r15] "i"(offsetof(__jmp_buf, r15)), [rsp] "i"(offsetof(__jmp_buf, rsp)), [rip] "i"(offsetof(__jmp_buf, rip))); } +#endif } // namespace 
LIBC_NAMESPACE_DECL diff --git a/libc/src/setjmp/x86_64/setjmp.cpp b/libc/src/setjmp/x86_64/setjmp.cpp index f6e82642edd7d..5ac10fa87b39a 100644 --- a/libc/src/setjmp/x86_64/setjmp.cpp +++ b/libc/src/setjmp/x86_64/setjmp.cpp @@ -11,12 +11,37 @@ #include "src/__support/macros/config.h" #include "src/setjmp/setjmp_impl.h" -#if !defined(LIBC_TARGET_ARCH_IS_X86_64) +#if !defined(LIBC_TARGET_ARCH_IS_X86) #error "Invalid file include" #endif namespace LIBC_NAMESPACE_DECL { +#ifdef __i386__ +[[gnu::naked]] +LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) { + asm(R"( + mov 4(%%esp), %%eax + + mov %%ebx, %c[ebx](%%eax) + mov %%esi, %c[esi](%%eax) + mov %%edi, %c[edi](%%eax) + mov %%ebp, %c[ebp](%%eax) + + lea 4(%%esp), %%ecx + mov %%ecx, %c[esp](%%eax) + + mov (%%esp), %%ecx + mov %%ecx, %c[eip](%%eax) + + xorl %%eax, %%eax + retl)" ::[ebx] "i"(offsetof(__jmp_buf, ebx)), + [esi] "i"(offsetof(__jmp_buf, esi)), [edi] "i"(offsetof(__jmp_buf, edi)), + [ebp] "i"(offsetof(__jmp_buf, ebp)), [esp] "i"(offsetof(__jmp_buf, esp)), + [eip] "i"(offsetof(__jmp_buf, eip)) + : "eax", "ecx"); +} +#else [[gnu::naked]] LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) { asm(R"( @@ -41,5 +66,6 @@ LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) { [rip] "i"(offsetof(__jmp_buf, rip)) : "rax"); } +#endif } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h index 78381e46e480d..22a1876da5369 100644 --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -221,7 +221,7 @@ LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src, return len; size_t n = len < size - 1 ? 
len : size - 1; inline_memcpy(dst, src, n); - inline_bzero(dst + n, size - n); + dst[n] = '\0'; return len; } @@ -239,11 +239,13 @@ LIBC_INLINE constexpr static char *strrchr_implementation(const char *src, int c) { char ch = static_cast(c); char *last_occurrence = nullptr; - for (; *src; ++src) { + while (true) { if (*src == ch) last_occurrence = const_cast(src); + if (!*src) + return last_occurrence; + ++src; } - return last_occurrence; } } // namespace internal diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 7fcc6a32025b5..55fe73cd2f1ac 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -297,31 +297,35 @@ struct ModifyMXCSR { #define EXPECT_FP_EXCEPTION(expected) \ do { \ if (math_errhandling & MATH_ERREXCEPT) { \ - EXPECT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) & \ - ((expected) ? (expected) : FE_ALL_EXCEPT), \ - (expected)); \ + EXPECT_EQ( \ + LIBC_NAMESPACE::fputil::test_except( \ + static_cast(FE_ALL_EXCEPT)) & \ + ((expected) ? (expected) : static_cast(FE_ALL_EXCEPT)), \ + (expected)); \ } \ } while (0) #define ASSERT_FP_EXCEPTION(expected) \ do { \ if (math_errhandling & MATH_ERREXCEPT) { \ - ASSERT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) & \ - ((expected) ? (expected) : FE_ALL_EXCEPT), \ - (expected)); \ + ASSERT_EQ( \ + LIBC_NAMESPACE::fputil::test_except( \ + static_cast(FE_ALL_EXCEPT)) & \ + ((expected) ? 
(expected) : static_cast(FE_ALL_EXCEPT)), \ + (expected)); \ } \ } while (0) #define EXPECT_FP_EQ_WITH_EXCEPTION(expected_val, actual_val, expected_except) \ do { \ - LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); \ + LIBC_NAMESPACE::fputil::clear_except(static_cast(FE_ALL_EXCEPT)); \ EXPECT_FP_EQ(expected_val, actual_val); \ EXPECT_FP_EXCEPTION(expected_except); \ } while (0) #define EXPECT_FP_IS_NAN_WITH_EXCEPTION(actual_val, expected_except) \ do { \ - LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); \ + LIBC_NAMESPACE::fputil::clear_except(static_cast(FE_ALL_EXCEPT)); \ EXPECT_FP_IS_NAN(actual_val); \ EXPECT_FP_EXCEPTION(expected_except); \ } while (0) @@ -374,7 +378,7 @@ struct ModifyMXCSR { using namespace LIBC_NAMESPACE::fputil::testing; \ ForceRoundingMode __r((rounding_mode)); \ if (__r.success) { \ - LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); \ + LIBC_NAMESPACE::fputil::clear_except(static_cast(FE_ALL_EXCEPT)); \ EXPECT_FP_EQ((expected), (actual)); \ EXPECT_FP_EXCEPTION(expected_except); \ } \ diff --git a/libc/test/UnitTest/LibcTest.h b/libc/test/UnitTest/LibcTest.h index 2b972004e9eea..b4e3819ea958d 100644 --- a/libc/test/UnitTest/LibcTest.h +++ b/libc/test/UnitTest/LibcTest.h @@ -162,6 +162,14 @@ class Test { (unsigned long long)RHS, LHSStr, RHSStr, Loc); } + // Helper to allow macro invocations like `ASSERT_EQ(foo, nullptr)`. 
+ template , ValType> = nullptr> + bool test(TestCond Cond, ValType LHS, cpp::nullptr_t, const char *LHSStr, + const char *RHSStr, internal::Location Loc) { + return test(Cond, LHS, static_cast(nullptr), LHSStr, RHSStr, Loc); + } + template < typename ValType, cpp::enable_if_t< diff --git a/libc/test/src/search/hsearch_test.cpp b/libc/test/src/search/hsearch_test.cpp index f7d94791f2bc0..01805d8f5b4da 100644 --- a/libc/test/src/search/hsearch_test.cpp +++ b/libc/test/src/search/hsearch_test.cpp @@ -15,7 +15,6 @@ #include "src/search/hsearch.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include TEST(LlvmLibcHsearchTest, CreateTooLarge) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; diff --git a/libc/test/src/string/StrchrTest.h b/libc/test/src/string/StrchrTest.h index 74e172de95953..8c3fe5293008a 100644 --- a/libc/test/src/string/StrchrTest.h +++ b/libc/test/src/string/StrchrTest.h @@ -40,14 +40,16 @@ template struct StrchrTest : public LIBC_NAMESPACE::testing::Test { const char *src = "abcde"; // Should return null terminator. - ASSERT_STREQ(Func(src, '\0'), ""); + const char *nul_terminator = Func(src, '\0'); + ASSERT_NE(nul_terminator, nullptr); + ASSERT_STREQ(nul_terminator, ""); // Source string should not change. ASSERT_STREQ(src, "abcde"); } void characterNotWithinStringShouldReturnNullptr() { // Since 'z' is not within the string, should return nullptr. - ASSERT_STREQ(Func("123?", 'z'), nullptr); + ASSERT_EQ(Func("123?", 'z'), nullptr); } void theSourceShouldNotChange() { @@ -74,11 +76,13 @@ template struct StrchrTest : public LIBC_NAMESPACE::testing::Test { void emptyStringShouldOnlyMatchNullTerminator() { // Null terminator should match. - ASSERT_STREQ(Func("", '\0'), ""); + const char empty_string[] = ""; + ASSERT_EQ(static_cast(Func(empty_string, '\0')), + empty_string); // All other characters should not match. 
- ASSERT_STREQ(Func("", 'Z'), nullptr); - ASSERT_STREQ(Func("", '3'), nullptr); - ASSERT_STREQ(Func("", '*'), nullptr); + ASSERT_EQ(Func("", 'Z'), nullptr); + ASSERT_EQ(Func("", '3'), nullptr); + ASSERT_EQ(Func("", '*'), nullptr); } }; @@ -114,7 +118,9 @@ template struct StrrchrTest : public LIBC_NAMESPACE::testing::Test { const char *src = "abcde"; // Should return null terminator. - ASSERT_STREQ(Func(src, '\0'), ""); + const char *nul_terminator = Func(src, '\0'); + ASSERT_NE(nul_terminator, nullptr); + ASSERT_STREQ(nul_terminator, ""); // Source string should not change. ASSERT_STREQ(src, "abcde"); } @@ -122,9 +128,9 @@ template struct StrrchrTest : public LIBC_NAMESPACE::testing::Test { void findsLastBehindFirstNullTerminator() { static const char src[6] = {'a', 'a', '\0', 'b', '\0', 'c'}; // 'b' is behind a null terminator, so should not be found. - ASSERT_STREQ(Func(src, 'b'), nullptr); + ASSERT_EQ(Func(src, 'b'), nullptr); // Same goes for 'c'. - ASSERT_STREQ(Func(src, 'c'), nullptr); + ASSERT_EQ(Func(src, 'c'), nullptr); // Should find the second of the two a's. ASSERT_STREQ(Func(src, 'a'), "a"); @@ -132,7 +138,7 @@ template struct StrrchrTest : public LIBC_NAMESPACE::testing::Test { void characterNotWithinStringShouldReturnNullptr() { // Since 'z' is not within the string, should return nullptr. - ASSERT_STREQ(Func("123?", 'z'), nullptr); + ASSERT_EQ(Func("123?", 'z'), nullptr); } void shouldFindLastOfDuplicates() { @@ -146,11 +152,13 @@ template struct StrrchrTest : public LIBC_NAMESPACE::testing::Test { void emptyStringShouldOnlyMatchNullTerminator() { // Null terminator should match. - ASSERT_STREQ(Func("", '\0'), ""); + const char empty_string[] = ""; + ASSERT_EQ(static_cast(Func(empty_string, '\0')), + empty_string); // All other characters should not match. 
- ASSERT_STREQ(Func("", 'A'), nullptr); - ASSERT_STREQ(Func("", '2'), nullptr); - ASSERT_STREQ(Func("", '*'), nullptr); + ASSERT_EQ(Func("", 'A'), nullptr); + ASSERT_EQ(Func("", '2'), nullptr); + ASSERT_EQ(Func("", '*'), nullptr); } }; diff --git a/libc/test/src/string/strlcat_test.cpp b/libc/test/src/string/strlcat_test.cpp index 1ffa4b0e921e2..5757fc92b39d2 100644 --- a/libc/test/src/string/strlcat_test.cpp +++ b/libc/test/src/string/strlcat_test.cpp @@ -27,6 +27,15 @@ TEST(LlvmLibcStrlcatTest, Smaller) { EXPECT_STREQ(buf, "abcd"); } +TEST(LlvmLibcStrlcatTest, SmallerNoOverwriteAfter0) { + const char *str = "cd"; + char buf[8]{"ab\0\0efg"}; + + EXPECT_EQ(LIBC_NAMESPACE::strlcat(buf, str, 8), size_t(4)); + EXPECT_STREQ(buf, "abcd"); + EXPECT_STREQ(buf + 5, "fg"); +} + TEST(LlvmLibcStrlcatTest, No0) { const char *str = "cd"; char buf[7]{"ab"}; diff --git a/libc/test/src/string/strlcpy_test.cpp b/libc/test/src/string/strlcpy_test.cpp index 5a1e30c12963f..ecf0e925a265c 100644 --- a/libc/test/src/string/strlcpy_test.cpp +++ b/libc/test/src/string/strlcpy_test.cpp @@ -25,6 +25,5 @@ TEST(LlvmLibcStrlcpyTest, Smaller) { EXPECT_EQ(LIBC_NAMESPACE::strlcpy(buf, str, 7), size_t(3)); EXPECT_STREQ(buf, "abc"); - for (const char *p = buf + 3; p < buf + 7; p++) - EXPECT_EQ(*p, '\0'); + EXPECT_STREQ(buf + 4, "11"); } diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt index a432d88ffb90c..69263986cc574 100644 --- a/libc/test/src/sys/mman/linux/CMakeLists.txt +++ b/libc/test/src/sys/mman/linux/CMakeLists.txt @@ -163,5 +163,6 @@ add_libc_unittest( libc.src.unistd.ftruncate libc.src.unistd.close libc.src.__support.OSUtil.osutil + libc.hdr.fcntl_macros libc.test.UnitTest.ErrnoSetterMatcher ) diff --git a/libc/test/src/sys/mman/linux/mlock_test.cpp b/libc/test/src/sys/mman/linux/mlock_test.cpp index 804038a68a7e0..c6e459276a8fb 100644 --- a/libc/test/src/sys/mman/linux/mlock_test.cpp +++ b/libc/test/src/sys/mman/linux/mlock_test.cpp 
@@ -23,8 +23,6 @@ #include "test/UnitTest/LibcTest.h" #include "test/UnitTest/Test.h" -#include -#include #include #include #include diff --git a/libc/test/src/sys/mman/linux/shm_test.cpp b/libc/test/src/sys/mman/linux/shm_test.cpp index 4b8971f670581..97de705c4c2b0 100644 --- a/libc/test/src/sys/mman/linux/shm_test.cpp +++ b/libc/test/src/sys/mman/linux/shm_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "hdr/fcntl_macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/fcntl/fcntl.h" #include "src/sys/mman/mmap.h" @@ -16,7 +17,6 @@ #include "src/unistd/ftruncate.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/LibcTest.h" -#include #include using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; diff --git a/libc/test/src/sys/statvfs/linux/CMakeLists.txt b/libc/test/src/sys/statvfs/linux/CMakeLists.txt index 1f8688868e043..fa1e9052d1cac 100644 --- a/libc/test/src/sys/statvfs/linux/CMakeLists.txt +++ b/libc/test/src/sys/statvfs/linux/CMakeLists.txt @@ -8,8 +8,9 @@ add_libc_unittest( statvfs_test.cpp DEPENDS libc.src.errno.errno - libc.src.sys.statvfs.linux.statfs_utils libc.src.sys.statvfs.statvfs + libc.src.sys.stat.mkdirat + libc.src.sys.stat.rmdir libc.test.UnitTest.ErrnoSetterMatcher ) @@ -21,8 +22,9 @@ add_libc_unittest( fstatvfs_test.cpp DEPENDS libc.src.errno.errno - libc.src.sys.statvfs.linux.statfs_utils libc.src.sys.statvfs.fstatvfs + libc.src.sys.stat.mkdirat + libc.src.sys.stat.rmdir libc.src.fcntl.open libc.src.unistd.close libc.test.UnitTest.ErrnoSetterMatcher diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp index 2f3e0b96ff095..efd1e688280b5 100644 --- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp @@ -1,49 +1,56 @@ +//===-- Unittests for fstatvfs --------------------------------------------===// +// +// Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include "hdr/fcntl_macros.h" #include "src/__support/macros/config.h" #include "src/fcntl/open.h" +#include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/fstatvfs.h" -#include "src/sys/statvfs/linux/statfs_utils.h" #include "src/unistd/close.h" +#include "src/unistd/rmdir.h" #include "test/UnitTest/ErrnoSetterMatcher.h" -#include "test/UnitTest/LibcTest.h" -#include +#include "test/UnitTest/Test.h" + using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; -#ifdef SYS_statfs64 -using StatFs = statfs64; -#else -using StatFs = statfs; -#endif - -namespace LIBC_NAMESPACE_DECL { -static int fstatfs(int fd, StatFs *buf) { - using namespace statfs_utils; - if (cpp::optional result = linux_fstatfs(fd)) { - *buf = *result; - return 0; - } - return -1; -} -} // namespace LIBC_NAMESPACE_DECL - -struct PathFD { - int fd; - explicit PathFD(const char *path) - : fd(LIBC_NAMESPACE::open(path, O_CLOEXEC | O_PATH)) {} - ~PathFD() { LIBC_NAMESPACE::close(fd); } - operator int() const { return fd; } -}; - -TEST(LlvmLibcSysStatvfsTest, FstatfsBasic) { - StatFs buf; - ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/"), &buf), Succeeds()); - ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/proc"), &buf), Succeeds()); - ASSERT_EQ(buf.f_type, static_cast(PROC_SUPER_MAGIC)); - ASSERT_THAT(LIBC_NAMESPACE::fstatfs(PathFD("/sys"), &buf), Succeeds()); - ASSERT_EQ(buf.f_type, static_cast(SYSFS_MAGIC)); +TEST(LlvmLibcSysFStatvfsTest, FStatvfsBasic) { + struct statvfs buf; + + int fd = LIBC_NAMESPACE::open("/", O_PATH); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + // The root of the file directory must always exist + ASSERT_THAT(LIBC_NAMESPACE::fstatvfs(fd, &buf), Succeeds()); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), 
Succeeds(0)); } -TEST(LlvmLibcSysStatvfsTest, FstatvfsInvalidFD) { +TEST(LlvmLibcSysFStatvfsTest, FStatvfsInvalidPath) { struct statvfs buf; - ASSERT_THAT(LIBC_NAMESPACE::fstatvfs(-1, &buf), Fails(EBADF)); + + constexpr const char *FILENAME = "testdata/statvfs.testdir"; + auto TEST_DIR = libc_make_test_file_path(FILENAME); + + ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), + Succeeds(0)); + + int fd = LIBC_NAMESPACE::open(TEST_DIR, O_PATH); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + // create the file, assert it exists, then delete it and assert it doesn't + // exist anymore. + + ASSERT_THAT(LIBC_NAMESPACE::fstatvfs(fd, &buf), Succeeds()); + + ASSERT_THAT(LIBC_NAMESPACE::rmdir(TEST_DIR), Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::fstatvfs(fd, &buf), Fails(ENOENT)); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::fstatvfs(fd, &buf), Fails(ENOENT)); } diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp index 5329adb54d64d..0b154e7aa3fb7 100644 --- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp @@ -1,54 +1,43 @@ +//===-- Unittests for statvfs ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fcntl_macros.h" #include "src/__support/macros/config.h" -#include "src/sys/statvfs/linux/statfs_utils.h" +#include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/statvfs.h" +#include "src/unistd/rmdir.h" #include "test/UnitTest/ErrnoSetterMatcher.h" -#include "test/UnitTest/LibcTest.h" -#include +#include "test/UnitTest/Test.h" + using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; -#ifdef SYS_statfs64 -using StatFs = statfs64; -#else -using StatFs = statfs; -#endif - -namespace LIBC_NAMESPACE_DECL { -static int statfs(const char *path, StatFs *buf) { - using namespace statfs_utils; - if (cpp::optional result = linux_statfs(path)) { - *buf = *result; - return 0; - } - return -1; -} -} // namespace LIBC_NAMESPACE_DECL - -TEST(LlvmLibcSysStatfsTest, StatfsBasic) { - StatFs buf; - ASSERT_THAT(LIBC_NAMESPACE::statfs("/", &buf), Succeeds()); - ASSERT_THAT(LIBC_NAMESPACE::statfs("/proc", &buf), Succeeds()); - ASSERT_EQ(buf.f_type, static_cast(PROC_SUPER_MAGIC)); - ASSERT_THAT(LIBC_NAMESPACE::statfs("/sys", &buf), Succeeds()); - ASSERT_EQ(buf.f_type, static_cast(SYSFS_MAGIC)); +TEST(LlvmLibcSysStatvfsTest, StatvfsBasic) { + struct statvfs buf; + // The root of the file directory must always exist + ASSERT_THAT(LIBC_NAMESPACE::statvfs("/", &buf), Succeeds()); } -TEST(LlvmLibcSysStatfsTest, StatvfsInvalidPath) { +TEST(LlvmLibcSysStatvfsTest, StatvfsInvalidPath) { struct statvfs buf; + ASSERT_THAT(LIBC_NAMESPACE::statvfs("", &buf), Fails(ENOENT)); - ASSERT_THAT(LIBC_NAMESPACE::statvfs("/nonexistent", &buf), Fails(ENOENT)); - ASSERT_THAT(LIBC_NAMESPACE::statvfs("/dev/null/whatever", &buf), - Fails(ENOTDIR)); - ASSERT_THAT(LIBC_NAMESPACE::statvfs(nullptr, &buf), Fails(EFAULT)); -} -TEST(LlvmLibcSysStatfsTest, StatvfsNameTooLong) { - struct statvfs buf; - ASSERT_THAT(LIBC_NAMESPACE::statvfs("/", &buf), 
Succeeds()); - char *name = static_cast(__builtin_alloca(buf.f_namemax + 3)); - name[0] = '/'; - name[buf.f_namemax + 2] = '\0'; - for (unsigned i = 1; i < buf.f_namemax + 2; ++i) { - name[i] = 'a'; - } - ASSERT_THAT(LIBC_NAMESPACE::statvfs(name, &buf), Fails(ENAMETOOLONG)); + // create the file, assert it exists, then delete it and assert it doesn't + // exist anymore. + constexpr const char *FILENAME = "testdata/statvfs.testdir"; + auto TEST_DIR = libc_make_test_file_path(FILENAME); + + ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), + Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::statvfs(TEST_DIR, &buf), Succeeds()); + + ASSERT_THAT(LIBC_NAMESPACE::rmdir(TEST_DIR), Succeeds(0)); + + ASSERT_THAT(LIBC_NAMESPACE::statvfs(TEST_DIR, &buf), Fails(ENOENT)); } diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 60e4abadb5e3c..5afc3d007d4d7 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -262,6 +262,7 @@ class MPFRNumber { int d = mpz_tstbit(integer, 0); mpfr_set_si(result.value, d ? 
-1 : 1, mpfr_rounding); + mpz_clear(integer); return result; } diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 16d74e53295cc..2c2c7f16e2944 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -321,21 +321,30 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) message( STATUS " device: ${d} ( ${${d}_aliases} )" ) if ( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64 ) - set( build_flags -O0 -finline-hint-functions ) + set( build_flags -O0 -finline-hint-functions -DCLC_SPIRV ) set( opt_flags ) set( spvflags --spirv-max-version=1.1 ) + set( MACRO_ARCH SPIRV32 ) + if( ARCH STREQUAL spirv64 ) + set( MACRO_ARCH SPIRV64 ) + endif() elseif( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) - set( build_flags "-Wno-unknown-assumption") + set( build_flags "-Wno-unknown-assumption" -DCLC_CLSPV ) set( opt_flags -O3 ) + set( MACRO_ARCH CLSPV32 ) + if( ARCH STREQUAL clspv64 ) + set( MACRO_ARCH CLSPV64 ) + endif() else() set( build_flags ) set( opt_flags -O3 ) + set( MACRO_ARCH ${ARCH} ) endif() set( LIBCLC_ARCH_OBJFILE_DIR "${LIBCLC_OBJFILE_DIR}/${arch_suffix}" ) file( MAKE_DIRECTORY ${LIBCLC_ARCH_OBJFILE_DIR} ) - string( TOUPPER "CLC_${ARCH}" CLC_TARGET_DEFINE ) + string( TOUPPER "CLC_${MACRO_ARCH}" CLC_TARGET_DEFINE ) list( APPEND build_flags -D__CLC_INTERNAL diff --git a/libclc/clc/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h index fe3406f64fecb..4698f0950d8a8 100644 --- a/libclc/clc/include/clc/clcfunc.h +++ b/libclc/clc/include/clc/clcfunc.h @@ -7,9 +7,9 @@ // avoid inlines for SPIR-V related targets since we'll optimise later in the // chain -#if defined(CLC_SPIRV) || defined(CLC_SPIRV64) +#if defined(CLC_SPIRV) #define _CLC_DEF -#elif defined(CLC_CLSPV) || defined(CLC_CLSPV64) +#elif defined(CLC_CLSPV) #define _CLC_DEF __attribute__((noinline)) __attribute__((clspv_libclc_builtin)) #else #define _CLC_DEF __attribute__((always_inline)) diff --git a/libclc/generic/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc 
similarity index 99% rename from libclc/generic/include/clc/integer/gentype.inc rename to libclc/clc/include/clc/integer/gentype.inc index cefed9c5e51ee..2c8dd143db879 100644 --- a/libclc/generic/include/clc/integer/gentype.inc +++ b/libclc/clc/include/clc/integer/gentype.inc @@ -1,5 +1,5 @@ -//These 2 defines only change when switching between data sizes or base types to -//keep this file manageable. +// These 2 defines only change when switching between data sizes or base types +// to keep this file manageable. #define __CLC_GENSIZE 8 #define __CLC_SCALAR_GENTYPE char diff --git a/libclc/generic/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc similarity index 100% rename from libclc/generic/include/clc/math/gentype.inc rename to libclc/clc/include/clc/math/gentype.inc diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h new file mode 100644 index 0000000000000..5c044c9a1a510 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -0,0 +1,15 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible clamp +#define __clc_clamp clamp +#else + +#include +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_clamp.inc b/libclc/clc/include/clc/shared/clc_clamp.inc new file mode 100644 index 0000000000000..cf6b0b2789bc5 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_clamp.inc @@ -0,0 +1,9 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_GENTYPE z); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_SCALAR_GENTYPE y, + __CLC_SCALAR_GENTYPE z); +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.h b/libclc/clc/include/clc/shared/clc_max.h new file mode 100644 index 0000000000000..2825640f6c291 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.h @@ 
-0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible max +#define __clc_max max +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_max.inc b/libclc/clc/include/clc/shared/clc_max.inc new file mode 100644 index 0000000000000..bddb3fa3d920c --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_max.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.h b/libclc/clc/include/clc/shared/clc_min.h new file mode 100644 index 0000000000000..0b7ee140b8f45 --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.h @@ -0,0 +1,12 @@ +#if defined(CLC_CLSPV) || defined(CLC_SPIRV) +// clspv and spir-v targets provide their own OpenCL-compatible min +#define __clc_min min +#else + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif diff --git a/libclc/clc/include/clc/shared/clc_min.inc b/libclc/clc/include/clc/shared/clc_min.inc new file mode 100644 index 0000000000000..3e1da96df43dd --- /dev/null +++ b/libclc/clc/include/clc/shared/clc_min.inc @@ -0,0 +1,7 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_GENTYPE b); + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b); +#endif diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index fa2e4f50b99cd..db523adb63836 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1 +1,4 @@ geometric/clc_dot.cl +shared/clc_clamp.cl +shared/clc_max.cl +shared/clc_min.cl diff --git a/libclc/clc/lib/generic/shared/clc_clamp.cl b/libclc/clc/lib/generic/shared/clc_clamp.cl new file mode 100644 index 
0000000000000..1d40da3cf2296 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_clamp.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/shared/clc_clamp.inc b/libclc/clc/lib/generic/shared/clc_clamp.inc new file mode 100644 index 0000000000000..da67cd2ad69db --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_clamp.inc @@ -0,0 +1,14 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_GENTYPE y, + __CLC_GENTYPE z) { + return (x > z ? z : (x < y ? y : x)); +} + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_clamp(__CLC_GENTYPE x, + __CLC_SCALAR_GENTYPE y, + __CLC_SCALAR_GENTYPE z) { + return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z + : (x < (__CLC_GENTYPE)y ? (__CLC_GENTYPE)y : x)); +} +#endif diff --git a/libclc/clc/lib/generic/shared/clc_max.cl b/libclc/clc/lib/generic/shared/clc_max.cl new file mode 100644 index 0000000000000..e1050ed0007ee --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_max.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/shared/clc_max.inc b/libclc/clc/lib/generic/shared/clc_max.inc new file mode 100644 index 0000000000000..f4234cb359d86 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_max.inc @@ -0,0 +1,11 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_GENTYPE b) { + return (a > b ? a : b); +} + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { + return (a > (__CLC_GENTYPE)b ? 
a : (__CLC_GENTYPE)b); +} +#endif diff --git a/libclc/clc/lib/generic/shared/clc_min.cl b/libclc/clc/lib/generic/shared/clc_min.cl new file mode 100644 index 0000000000000..12a26f5352407 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_min.cl @@ -0,0 +1,7 @@ +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/shared/clc_min.inc b/libclc/clc/lib/generic/shared/clc_min.inc new file mode 100644 index 0000000000000..e9c85ddd3affa --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_min.inc @@ -0,0 +1,11 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_GENTYPE b) { + return (b < a ? b : a); +} + +#ifndef __CLC_SCALAR +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { + return (b < (__CLC_GENTYPE)a ? (__CLC_GENTYPE)b : a); +} +#endif diff --git a/libclc/generic/include/config.h b/libclc/generic/include/config.h index 2994199b02c5b..7aa5967f4eb68 100644 --- a/libclc/generic/include/config.h +++ b/libclc/generic/include/config.h @@ -20,6 +20,8 @@ * THE SOFTWARE. 
*/ +#include + _CLC_DECL bool __clc_subnormals_disabled(); _CLC_DECL bool __clc_fp16_subnormals_supported(); _CLC_DECL bool __clc_fp32_subnormals_supported(); diff --git a/libclc/generic/lib/common/smoothstep.cl b/libclc/generic/lib/common/smoothstep.cl index 9f513eb379e19..1b6a74b89d2c2 100644 --- a/libclc/generic/lib/common/smoothstep.cl +++ b/libclc/generic/lib/common/smoothstep.cl @@ -46,7 +46,7 @@ SMOOTH_STEP_DEF(double, double, SMOOTH_STEP_IMPL_D); _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, smoothstep, double, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) SMOOTH_STEP_DEF(float, double, SMOOTH_STEP_IMPL_D); SMOOTH_STEP_DEF(double, float, SMOOTH_STEP_IMPL_D); diff --git a/libclc/generic/lib/common/step.cl b/libclc/generic/lib/common/step.cl index 5d7c48780d4fa..8155b469fb210 100644 --- a/libclc/generic/lib/common/step.cl +++ b/libclc/generic/lib/common/step.cl @@ -45,7 +45,7 @@ STEP_DEF(double, double); _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); _CLC_V_S_V_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, step, double, double); -#if !defined(CLC_SPIRV) && !defined(CLC_SPIRV64) +#if !defined(CLC_SPIRV) STEP_DEF(float, double); STEP_DEF(double, float); diff --git a/libclc/generic/lib/math/clc_hypot.cl b/libclc/generic/lib/math/clc_hypot.cl index 35532a9532062..ce88f276cf2f2 100644 --- a/libclc/generic/lib/math/clc_hypot.cl +++ b/libclc/generic/lib/math/clc_hypot.cl @@ -21,78 +21,82 @@ */ #include +#include #include #include "config.h" #include "math.h" #include "../clcmacro.h" -// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result warrants it -_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) -{ - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - uint uy = as_uint(y); - uint auy = uy & EXSIGNBIT_SP32; - float retval; - int c = aux > auy; - ux = c ? aux : auy; - uy = c ? 
auy : aux; - - int xexp = clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); - float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float fx = as_float(ux) * fi_exp; - float fy = as_float(uy) * fi_exp; - retval = sqrt(mad(fx, fx, fy*fy)) * fx_exp; - - retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval; - retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 ? as_float(PINFBITPATT_SP32) : retval; - return retval; +// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result +// warrants it +_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) { + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint uy = as_uint(y); + uint auy = uy & EXSIGNBIT_SP32; + float retval; + int c = aux > auy; + ux = c ? aux : auy; + uy = c ? auy : aux; + + int xexp = + __clc_clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126); + float fx_exp = as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fi_exp = as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + float fx = as_float(ux) * fi_exp; + float fy = as_float(uy) * fi_exp; + retval = sqrt(mad(fx, fx, fy * fy)) * fx_exp; + + retval = ux > PINFBITPATT_SP32 | uy == 0 ? as_float(ux) : retval; + retval = ux == PINFBITPATT_SP32 | uy == PINFBITPATT_SP32 + ? 
as_float(PINFBITPATT_SP32) + : retval; + return retval; } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_hypot, float, float) #ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) -{ - ulong ux = as_ulong(x) & ~SIGNBIT_DP64; - int xexp = ux >> EXPSHIFTBITS_DP64; - x = as_double(ux); +_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) { + ulong ux = as_ulong(x) & ~SIGNBIT_DP64; + int xexp = ux >> EXPSHIFTBITS_DP64; + x = as_double(ux); - ulong uy = as_ulong(y) & ~SIGNBIT_DP64; - int yexp = uy >> EXPSHIFTBITS_DP64; - y = as_double(uy); + ulong uy = as_ulong(y) & ~SIGNBIT_DP64; + int yexp = uy >> EXPSHIFTBITS_DP64; + y = as_double(uy); - int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; - double preadjust = c ? 0x1.0p-600 : 1.0; - double postadjust = c ? 0x1.0p+600 : 1.0; + int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500; + double preadjust = c ? 0x1.0p-600 : 1.0; + double postadjust = c ? 0x1.0p+600 : 1.0; - c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; - preadjust = c ? 0x1.0p+600 : preadjust; - postadjust = c ? 0x1.0p-600 : postadjust; + c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500; + preadjust = c ? 0x1.0p+600 : preadjust; + postadjust = c ? 0x1.0p-600 : postadjust; - double ax = x * preadjust; - double ay = y * preadjust; + double ax = x * preadjust; + double ay = y * preadjust; - // The post adjust may overflow, but this can't be avoided in any case - double r = sqrt(fma(ax, ax, ay*ay)) * postadjust; + // The post adjust may overflow, but this can't be avoided in any case + double r = sqrt(fma(ax, ax, ay * ay)) * postadjust; - // If the difference in exponents between x and y is large - double s = x + y; - c = abs(xexp - yexp) > MANTLENGTH_DP64 + 1; - r = c ? s : r; + // If the difference in exponents between x and y is large + double s = x + y; + c = abs(xexp - yexp) > MANTLENGTH_DP64 + 1; + r = c ? 
s : r; - // Check for NaN - //c = x != x | y != y; - c = isnan(x) | isnan(y); - r = c ? as_double(QNANBITPATT_DP64) : r; + // Check for NaN + // c = x != x | y != y; + c = isnan(x) | isnan(y); + r = c ? as_double(QNANBITPATT_DP64) : r; - // If either is Inf, we must return Inf - c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); - r = c ? as_double(PINFBITPATT_DP64) : r; + // If either is Inf, we must return Inf + c = x == as_double(PINFBITPATT_DP64) | y == as_double(PINFBITPATT_DP64); + r = c ? as_double(PINFBITPATT_DP64) : r; - return r; + return r; } -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double) +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, + double) #endif diff --git a/libclc/generic/lib/math/clc_ldexp.cl b/libclc/generic/lib/math/clc_ldexp.cl index ae6117b7b2922..438c31835a364 100644 --- a/libclc/generic/lib/math/clc_ldexp.cl +++ b/libclc/generic/lib/math/clc_ldexp.cl @@ -20,76 +20,78 @@ * THE SOFTWARE. */ -#include -#include "config.h" #include "../clcmacro.h" +#include "config.h" #include "math.h" +#include +#include _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { - if (!__clc_fp32_subnormals_supported()) { - - // This treats subnormals as zeros - int i = as_int(x); - int e = (i >> 23) & 0xff; - int m = i & 0x007fffff; - int s = i & 0x80000000; - int v = add_sat(e, n); - v = clamp(v, 0, 0xff); - int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; - int c = e == 0xff; - mr = c ? m : mr; - int er = c ? e : v; - er = e ? 
er : e; - return as_float( s | (er << 23) | mr ); - } - - /* supports denormal values */ - const int multiplier = 24; - float val_f; - uint val_ui; - uint sign; - int exponent; - val_ui = as_uint(x); - sign = val_ui & 0x80000000; - val_ui = val_ui & 0x7fffffff;/* remove the sign bit */ - int val_x = val_ui; - - exponent = val_ui >> 23; /* get the exponent */ - int dexp = exponent; - - /* denormal support */ - int fbh = 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); - int dexponent = 25 - fbh; - uint dval_ui = (( (val_ui << fbh) & 0x007fffff) | (dexponent << 23)); - int ex = dexponent + n - multiplier; - dexponent = ex; - uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); - int ex1 = dexponent + multiplier; - ex1 = -ex1 +25; - dval_ui = (((dval_ui & 0x007fffff )| 0x800000) >> ex1); - dval_ui = dexponent > 0 ? val :dval_ui; - dval_ui = dexponent > 254 ? 0x7f800000 :dval_ui; /*overflow*/ - dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ - dval_ui = dval_ui | sign; - val_f = as_float(dval_ui); - - exponent += n; - - val = sign | (exponent << 23) | (val_ui & 0x007fffff); - ex1 = exponent + multiplier; - ex1 = -ex1 +25; - val_ui = (((val_ui & 0x007fffff )| 0x800000) >> ex1); - val_ui = exponent > 0 ? val :val_ui; - val_ui = exponent > 254 ? 0x7f800000 :val_ui; /*overflow*/ - val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ - val_ui = val_ui | sign; - - val_ui = dexp == 0? dval_ui : val_ui; - val_f = as_float(val_ui); - - val_f = isnan(x) | isinf(x) | val_x == 0 ? x : val_f; - return val_f; + if (!__clc_fp32_subnormals_supported()) { + + // This treats subnormals as zeros + int i = as_int(x); + int e = (i >> 23) & 0xff; + int m = i & 0x007fffff; + int s = i & 0x80000000; + int v = add_sat(e, n); + v = __clc_clamp(v, 0, 0xff); + int mr = e == 0 | v == 0 | v == 0xff ? 0 : m; + int c = e == 0xff; + mr = c ? m : mr; + int er = c ? e : v; + er = e ? 
er : e; + return as_float(s | (er << 23) | mr); + } + + /* supports denormal values */ + const int multiplier = 24; + float val_f; + uint val_ui; + uint sign; + int exponent; + val_ui = as_uint(x); + sign = val_ui & 0x80000000; + val_ui = val_ui & 0x7fffffff; /* remove the sign bit */ + int val_x = val_ui; + + exponent = val_ui >> 23; /* get the exponent */ + int dexp = exponent; + + /* denormal support */ + int fbh = + 127 - (as_uint((float)(as_float(val_ui | 0x3f800000) - 1.0f)) >> 23); + int dexponent = 25 - fbh; + uint dval_ui = (((val_ui << fbh) & 0x007fffff) | (dexponent << 23)); + int ex = dexponent + n - multiplier; + dexponent = ex; + uint val = sign | (ex << 23) | (dval_ui & 0x007fffff); + int ex1 = dexponent + multiplier; + ex1 = -ex1 + 25; + dval_ui = (((dval_ui & 0x007fffff) | 0x800000) >> ex1); + dval_ui = dexponent > 0 ? val : dval_ui; + dval_ui = dexponent > 254 ? 0x7f800000 : dval_ui; /*overflow*/ + dval_ui = dexponent < -multiplier ? 0 : dval_ui; /*underflow*/ + dval_ui = dval_ui | sign; + val_f = as_float(dval_ui); + + exponent += n; + + val = sign | (exponent << 23) | (val_ui & 0x007fffff); + ex1 = exponent + multiplier; + ex1 = -ex1 + 25; + val_ui = (((val_ui & 0x007fffff) | 0x800000) >> ex1); + val_ui = exponent > 0 ? val : val_ui; + val_ui = exponent > 254 ? 0x7f800000 : val_ui; /*overflow*/ + val_ui = exponent < -multiplier ? 0 : val_ui; /*underflow*/ + val_ui = val_ui | sign; + + val_ui = dexp == 0 ? dval_ui : val_ui; + val_f = as_float(val_ui); + + val_f = isnan(x) | isinf(x) | val_x == 0 ? 
x : val_f; + return val_f; } #ifdef cl_khr_fp64 @@ -97,32 +99,32 @@ _CLC_DEF _CLC_OVERLOAD float __clc_ldexp(float x, int n) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { - long l = as_ulong(x); - int e = (l >> 52) & 0x7ff; - long s = l & 0x8000000000000000; + long l = as_ulong(x); + int e = (l >> 52) & 0x7ff; + long s = l & 0x8000000000000000; - ulong ux = as_ulong(x * 0x1.0p+53); - int de = ((int)(ux >> 52) & 0x7ff) - 53; - int c = e == 0; - e = c ? de: e; + ulong ux = as_ulong(x * 0x1.0p+53); + int de = ((int)(ux >> 52) & 0x7ff) - 53; + int c = e == 0; + e = c ? de : e; - ux = c ? ux : l; + ux = c ? ux : l; - int v = e + n; - v = clamp(v, -0x7ff, 0x7ff); + int v = e + n; + v = __clc_clamp(v, -0x7ff, 0x7ff); - ux &= ~EXPBITS_DP64; + ux &= ~EXPBITS_DP64; - double mr = as_double(ux | ((ulong)(v+53) << 52)); - mr = mr * 0x1.0p-53; + double mr = as_double(ux | ((ulong)(v + 53) << 52)); + mr = mr * 0x1.0p-53; - mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; + mr = v > 0 ? as_double(ux | ((ulong)v << 52)) : mr; - mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; - mr = v < -53 ? as_double(s) : mr; + mr = v == 0x7ff ? as_double(s | PINFBITPATT_DP64) : mr; + mr = v < -53 ? as_double(s) : mr; - mr = ((n == 0) | isinf(x) | (x == 0) ) ? x : mr; - return mr; + mr = ((n == 0) | isinf(x) | (x == 0)) ? 
x : mr; + return mr; } #endif @@ -132,7 +134,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { #pragma OPENCL EXTENSION cl_khr_fp16 : enable _CLC_OVERLOAD _CLC_DEF half __clc_ldexp(half x, int n) { - return (half)__clc_ldexp((float)x, n); + return (half)__clc_ldexp((float)x, n); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_ldexp, half, int); diff --git a/libclc/generic/lib/math/math.h b/libclc/generic/lib/math/math.h index 351e37dc3f12c..d5ef0871e5201 100644 --- a/libclc/generic/lib/math/math.h +++ b/libclc/generic/lib/math/math.h @@ -40,7 +40,7 @@ #if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__ #define HAVE_HW_FMA32() (0) -#elif defined CLC_SPIRV || defined CLC_SPIRV64 +#elif defined(CLC_SPIRV) bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void); #define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32() #else diff --git a/libclc/generic/lib/shared/clamp.cl b/libclc/generic/lib/shared/clamp.cl index b946220485bea..f470fc822f756 100644 --- a/libclc/generic/lib/shared/clamp.cl +++ b/libclc/generic/lib/shared/clamp.cl @@ -1,4 +1,5 @@ #include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/clamp.inc b/libclc/generic/lib/shared/clamp.inc index c918f9c499e70..7e02cb2e1c47e 100644 --- a/libclc/generic/lib/shared/clamp.inc +++ b/libclc/generic/lib/shared/clamp.inc @@ -1,9 +1,9 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z) { - return (x > z ? z : (x < y ? y : x)); + return __clc_clamp(x, y, z); } #ifndef __CLC_SCALAR _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE clamp(__CLC_GENTYPE x, __CLC_SCALAR_GENTYPE y, __CLC_SCALAR_GENTYPE z) { - return (x > (__CLC_GENTYPE)z ? (__CLC_GENTYPE)z : (x < (__CLC_GENTYPE)y ? 
(__CLC_GENTYPE)y : x)); + return __clc_clamp(x, y, z); } #endif diff --git a/libclc/generic/lib/shared/max.cl b/libclc/generic/lib/shared/max.cl index eb573cdbca86b..2266d5905afd6 100644 --- a/libclc/generic/lib/shared/max.cl +++ b/libclc/generic/lib/shared/max.cl @@ -1,4 +1,5 @@ #include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/max.inc b/libclc/generic/lib/shared/max.inc index 75a24c077d1ab..ec433a89c6e92 100644 --- a/libclc/generic/lib/shared/max.inc +++ b/libclc/generic/lib/shared/max.inc @@ -1,9 +1,10 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_GENTYPE b) { - return (a > b ? a : b); + return __clc_max(a, b); } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { - return (a > (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE max(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { + return __clc_max(a, b); } #endif diff --git a/libclc/generic/lib/shared/min.cl b/libclc/generic/lib/shared/min.cl index 19a7d796c7b99..f5c4d57f4b8d8 100644 --- a/libclc/generic/lib/shared/min.cl +++ b/libclc/generic/lib/shared/min.cl @@ -1,4 +1,5 @@ #include +#include #define __CLC_BODY #include diff --git a/libclc/generic/lib/shared/min.inc b/libclc/generic/lib/shared/min.inc index e15e05591342e..6a00944cbe35e 100644 --- a/libclc/generic/lib/shared/min.inc +++ b/libclc/generic/lib/shared/min.inc @@ -1,9 +1,10 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) { - return (b < a ? b : a); + return __clc_min(a, b); } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { - return (b < (__CLC_GENTYPE)a ? 
(__CLC_GENTYPE)b : a); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { + return __clc_min(a, b); } #endif diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h index 8a98451a8f965..d76bf4903aaa9 100644 --- a/libcxx/include/__algorithm/copy_move_common.h +++ b/libcxx/include/__algorithm/copy_move_common.h @@ -13,6 +13,7 @@ #include <__algorithm/unwrap_iter.h> #include <__algorithm/unwrap_range.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/iterator_traits.h> #include <__memory/pointer_traits.h> #include <__string/constexpr_c_functions.h> @@ -24,7 +25,6 @@ #include <__type_traits/is_volatile.h> #include <__utility/move.h> #include <__utility/pair.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__algorithm/inplace_merge.h b/libcxx/include/__algorithm/inplace_merge.h index 62a8bc53e23f3..ad3fe6a7a505d 100644 --- a/libcxx/include/__algorithm/inplace_merge.h +++ b/libcxx/include/__algorithm/inplace_merge.h @@ -18,6 +18,7 @@ #include <__algorithm/rotate.h> #include <__algorithm/upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/identity.h> #include <__iterator/advance.h> #include <__iterator/distance.h> diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index cb83347584b1a..556bd4216307d 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -15,6 +15,7 @@ #include <__algorithm/simd_utils.h> #include <__algorithm/unwrap_iter.h> #include <__config> +#include <__cstddef/size_t.h> #include <__functional/identity.h> #include <__iterator/aliasing_iterator.h> #include <__iterator/iterator_traits.h> @@ -27,7 +28,6 @@ #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/unreachable.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # 
pragma GCC system_header diff --git a/libcxx/include/__algorithm/shuffle.h b/libcxx/include/__algorithm/shuffle.h index c9c56ce8c2c0b..7177fbb469ba7 100644 --- a/libcxx/include/__algorithm/shuffle.h +++ b/libcxx/include/__algorithm/shuffle.h @@ -11,12 +11,12 @@ #include <__algorithm/iterator_operations.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/iterator_traits.h> #include <__random/uniform_int_distribution.h> #include <__utility/forward.h> #include <__utility/move.h> #include <__utility/swap.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index 56518dafa3193..4e3e4f2b9404e 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -14,10 +14,10 @@ #include <__bit/countl.h> #include <__bit/countr.h> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_same.h> #include <__utility/integer_sequence.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__algorithm/stable_partition.h b/libcxx/include/__algorithm/stable_partition.h index 5df5e8eaf689b..0438f589a39d7 100644 --- a/libcxx/include/__algorithm/stable_partition.h +++ b/libcxx/include/__algorithm/stable_partition.h @@ -12,6 +12,7 @@ #include <__algorithm/iterator_operations.h> #include <__algorithm/rotate.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/advance.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h index ec556aad82e8d..43f591ac02b01 100644 --- a/libcxx/include/__algorithm/stable_sort.h +++ b/libcxx/include/__algorithm/stable_sort.h @@ -15,6 +15,7 @@ #include <__algorithm/iterator_operations.h> #include <__algorithm/sort.h> #include <__config> 
+#include <__cstddef/ptrdiff_t.h> #include <__debug_utils/strict_weak_ordering_check.h> #include <__iterator/iterator_traits.h> #include <__memory/destruct_n.h> diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h index afc64eaaa69e7..3c84747bebb81 100644 --- a/libcxx/include/__atomic/aliases.h +++ b/libcxx/include/__atomic/aliases.h @@ -14,9 +14,10 @@ #include <__atomic/contention_t.h> #include <__atomic/is_always_lock_free.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__type_traits/conditional.h> #include <__type_traits/make_unsigned.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index af6d12b5e4ce9..113475cb1f007 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -14,6 +14,7 @@ #include <__atomic/cxx_atomic_impl.h> #include <__atomic/memory_order.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/operations.h> #include <__memory/addressof.h> #include <__type_traits/enable_if.h> @@ -25,7 +26,6 @@ #include <__type_traits/remove_pointer.h> #include <__type_traits/remove_volatile.h> #include <__utility/forward.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h index 465cd9a77ea79..eef15983b9833 100644 --- a/libcxx/include/__atomic/atomic_ref.h +++ b/libcxx/include/__atomic/atomic_ref.h @@ -25,10 +25,11 @@ #include <__concepts/arithmetic.h> #include <__concepts/same_as.h> #include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/ptrdiff_t.h> #include <__memory/addressof.h> #include <__type_traits/has_unique_object_representation.h> #include <__type_traits/is_trivially_copyable.h> -#include #include #include diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h 
b/libcxx/include/__atomic/cxx_atomic_impl.h index 18e88aa97bec7..86a57d1d5d8ff 100644 --- a/libcxx/include/__atomic/cxx_atomic_impl.h +++ b/libcxx/include/__atomic/cxx_atomic_impl.h @@ -12,11 +12,12 @@ #include <__atomic/memory_order.h> #include <__atomic/to_gcc_order.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__memory/addressof.h> +#include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_const.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__charconv/from_chars_floating_point.h b/libcxx/include/__charconv/from_chars_floating_point.h index 5cd3fc4a41ea1..811e518a81db7 100644 --- a/libcxx/include/__charconv/from_chars_floating_point.h +++ b/libcxx/include/__charconv/from_chars_floating_point.h @@ -14,8 +14,8 @@ #include <__charconv/chars_format.h> #include <__charconv/from_chars_result.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__system_error/errc.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h index fd92be4b4ce91..710299df9b4da 100644 --- a/libcxx/include/__charconv/to_chars_integral.h +++ b/libcxx/include/__charconv/to_chars_integral.h @@ -18,6 +18,7 @@ #include <__charconv/to_chars_result.h> #include <__charconv/traits.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__system_error/errc.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> @@ -26,7 +27,6 @@ #include <__type_traits/make_32_64_or_128_bit.h> #include <__type_traits/make_unsigned.h> #include <__utility/unreachable.h> -#include #include #include diff --git a/libcxx/include/__compare/common_comparison_category.h b/libcxx/include/__compare/common_comparison_category.h index 
7aeb3da03a4f4..215922abad6b0 100644 --- a/libcxx/include/__compare/common_comparison_category.h +++ b/libcxx/include/__compare/common_comparison_category.h @@ -11,8 +11,8 @@ #include <__compare/ordering.h> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_same.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__concepts/swappable.h b/libcxx/include/__concepts/swappable.h index d339488a087a5..985c733021a0d 100644 --- a/libcxx/include/__concepts/swappable.h +++ b/libcxx/include/__concepts/swappable.h @@ -14,6 +14,7 @@ #include <__concepts/common_reference_with.h> #include <__concepts/constructible.h> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/extent.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> @@ -22,7 +23,6 @@ #include <__utility/forward.h> #include <__utility/move.h> #include <__utility/swap.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__config b/libcxx/include/__config index 1cf80a46686ab..308dcc998fc1c 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -14,6 +14,7 @@ #include <__configuration/abi.h> #include <__configuration/availability.h> #include <__configuration/compiler.h> +#include <__configuration/language.h> #include <__configuration/platform.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER diff --git a/libcxx/include/__coroutine/coroutine_handle.h b/libcxx/include/__coroutine/coroutine_handle.h index 4557a6643c239..e2cde20498d84 100644 --- a/libcxx/include/__coroutine/coroutine_handle.h +++ b/libcxx/include/__coroutine/coroutine_handle.h @@ -11,11 +11,12 @@ #include <__assert> #include <__config> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__functional/hash.h> #include <__memory/addressof.h> #include <__type_traits/remove_cv.h> #include -#include #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/adjacent_find.h b/libcxx/include/__cxx03/__algorithm/adjacent_find.h index 6f15456e3a4d0..88036db84de89 100644 --- a/libcxx/include/__cxx03/__algorithm/adjacent_find.h +++ b/libcxx/include/__cxx03/__algorithm/adjacent_find.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___ALGORITHM_ADJACENT_FIND_H #define _LIBCPP___ALGORITHM_ADJACENT_FIND_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/all_of.h b/libcxx/include/__cxx03/__algorithm/all_of.h index ec84eea759296..b32d97241506e 100644 --- a/libcxx/include/__cxx03/__algorithm/all_of.h +++ b/libcxx/include/__cxx03/__algorithm/all_of.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ALGORITHM_ALL_OF_H #define _LIBCPP___ALGORITHM_ALL_OF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/any_of.h b/libcxx/include/__cxx03/__algorithm/any_of.h index b5ff778c4171d..bd7de96fbef40 100644 --- a/libcxx/include/__cxx03/__algorithm/any_of.h +++ b/libcxx/include/__cxx03/__algorithm/any_of.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ALGORITHM_ANY_OF_H #define _LIBCPP___ALGORITHM_ANY_OF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__algorithm/binary_search.h b/libcxx/include/__cxx03/__algorithm/binary_search.h index 6065fc37274dc..a93c62b32d742 100644 --- a/libcxx/include/__cxx03/__algorithm/binary_search.h +++ b/libcxx/include/__cxx03/__algorithm/binary_search.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_BINARY_SEARCH_H #define _LIBCPP___ALGORITHM_BINARY_SEARCH_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/lower_bound.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/clamp.h b/libcxx/include/__cxx03/__algorithm/clamp.h index 1a5a3d0744be9..e5e4dbf774dc3 100644 --- a/libcxx/include/__cxx03/__algorithm/clamp.h +++ b/libcxx/include/__cxx03/__algorithm/clamp.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_CLAMP_H #define _LIBCPP___ALGORITHM_CLAMP_H -#include <__algorithm/comp.h> -#include <__assert> -#include <__config> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/comp.h b/libcxx/include/__cxx03/__algorithm/comp.h index a0fa88d6d2acd..f01e395dc21ea 100644 --- a/libcxx/include/__cxx03/__algorithm/comp.h +++ b/libcxx/include/__cxx03/__algorithm/comp.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ALGORITHM_COMP_H #define _LIBCPP___ALGORITHM_COMP_H -#include <__config> -#include <__type_traits/desugars_to.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/desugars_to.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__algorithm/comp_ref_type.h b/libcxx/include/__cxx03/__algorithm/comp_ref_type.h index c367fbb91ac28..413bf30c6a64c 100644 --- a/libcxx/include/__cxx03/__algorithm/comp_ref_type.h +++ b/libcxx/include/__cxx03/__algorithm/comp_ref_type.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_COMP_REF_TYPE_H #define _LIBCPP___ALGORITHM_COMP_REF_TYPE_H -#include <__assert> -#include <__config> -#include <__utility/declval.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/copy.h b/libcxx/include/__cxx03/__algorithm/copy.h index 0890b895f5409..6de8df3f980fe 100644 --- a/libcxx/include/__cxx03/__algorithm/copy.h +++ b/libcxx/include/__cxx03/__algorithm/copy.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_COPY_H #define _LIBCPP___ALGORITHM_COPY_H -#include <__algorithm/copy_move_common.h> -#include <__algorithm/for_each_segment.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__config> -#include <__iterator/segmented_iterator.h> -#include <__type_traits/common_type.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/copy_move_common.h> +#include <__cxx03/__algorithm/for_each_segment.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/copy_backward.h b/libcxx/include/__cxx03/__algorithm/copy_backward.h index 
73dc846a975a4..dd7ff8ada5280 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_backward.h +++ b/libcxx/include/__cxx03/__algorithm/copy_backward.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_COPY_BACKWARD_H #define _LIBCPP___ALGORITHM_COPY_BACKWARD_H -#include <__algorithm/copy_move_common.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__config> -#include <__iterator/segmented_iterator.h> -#include <__type_traits/common_type.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/copy_move_common.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/copy_if.h b/libcxx/include/__cxx03/__algorithm/copy_if.h index 228e4d22323e3..345b12878d333 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/copy_if.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_COPY_IF_H #define _LIBCPP___ALGORITHM_COPY_IF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/copy_move_common.h b/libcxx/include/__cxx03/__algorithm/copy_move_common.h index 8a98451a8f965..c598307025176 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_move_common.h +++ b/libcxx/include/__cxx03/__algorithm/copy_move_common.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___ALGORITHM_COPY_MOVE_COMMON_H 
#define _LIBCPP___ALGORITHM_COPY_MOVE_COMMON_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/unwrap_iter.h> -#include <__algorithm/unwrap_range.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__string/constexpr_c_functions.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_always_bitcastable.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__type_traits/is_volatile.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__algorithm/unwrap_range.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__string/constexpr_c_functions.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_always_bitcastable.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/copy_n.h b/libcxx/include/__cxx03/__algorithm/copy_n.h index f93f39203a7e3..14f1402944335 100644 --- a/libcxx/include/__cxx03/__algorithm/copy_n.h +++ b/libcxx/include/__cxx03/__algorithm/copy_n.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_COPY_N_H #define _LIBCPP___ALGORITHM_COPY_N_H -#include <__algorithm/copy.h> -#include <__config> -#include 
<__iterator/iterator_traits.h> -#include <__type_traits/enable_if.h> -#include <__utility/convert_to_integral.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/count.h b/libcxx/include/__cxx03/__algorithm/count.h index 1cfe7f631ac1b..b7024dc8c6318 100644 --- a/libcxx/include/__cxx03/__algorithm/count.h +++ b/libcxx/include/__cxx03/__algorithm/count.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___ALGORITHM_COUNT_H #define _LIBCPP___ALGORITHM_COUNT_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__bit/invert_if.h> -#include <__bit/popcount.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__fwd/bit_reference.h> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__bit/invert_if.h> +#include <__cxx03/__bit/popcount.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__fwd/bit_reference.h> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/count_if.h b/libcxx/include/__cxx03/__algorithm/count_if.h index 25782069d0327..eeb42052b08ae 100644 --- a/libcxx/include/__cxx03/__algorithm/count_if.h +++ b/libcxx/include/__cxx03/__algorithm/count_if.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___ALGORITHM_COUNT_IF_H #define _LIBCPP___ALGORITHM_COUNT_IF_H -#include <__config> -#include 
<__iterator/iterator_traits.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/equal.h b/libcxx/include/__cxx03/__algorithm/equal.h index bfc8f72f6eb19..a4d0a999b1819 100644 --- a/libcxx/include/__cxx03/__algorithm/equal.h +++ b/libcxx/include/__cxx03/__algorithm/equal.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___ALGORITHM_EQUAL_H #define _LIBCPP___ALGORITHM_EQUAL_H -#include <__algorithm/comp.h> -#include <__algorithm/unwrap_iter.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__string/constexpr_c_functions.h> -#include <__type_traits/desugars_to.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_equality_comparable.h> -#include <__type_traits/is_volatile.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__string/constexpr_c_functions.h> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_equality_comparable.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/equal_range.h b/libcxx/include/__cxx03/__algorithm/equal_range.h index 
09bbf8f006021..9abda2bd2e36f 100644 --- a/libcxx/include/__cxx03/__algorithm/equal_range.h +++ b/libcxx/include/__cxx03/__algorithm/equal_range.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_EQUAL_RANGE_H #define _LIBCPP___ALGORITHM_EQUAL_RANGE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/half_positive.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__algorithm/upper_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__type_traits/is_callable.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/half_positive.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/fill.h b/libcxx/include/__cxx03/__algorithm/fill.h index 1ce3eadb013d0..5da0f4457daa6 100644 --- a/libcxx/include/__cxx03/__algorithm/fill.h +++ 
b/libcxx/include/__cxx03/__algorithm/fill.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_FILL_H #define _LIBCPP___ALGORITHM_FILL_H -#include <__algorithm/fill_n.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/fill_n.h b/libcxx/include/__cxx03/__algorithm/fill_n.h index f29633f88087f..fd548f27056a1 100644 --- a/libcxx/include/__cxx03/__algorithm/fill_n.h +++ b/libcxx/include/__cxx03/__algorithm/fill_n.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___ALGORITHM_FILL_N_H #define _LIBCPP___ALGORITHM_FILL_N_H -#include <__algorithm/min.h> -#include <__config> -#include <__fwd/bit_reference.h> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__utility/convert_to_integral.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/bit_reference.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/find.h b/libcxx/include/__cxx03/__algorithm/find.h index 7f58dbb13a577..7a48a449c897b 100644 --- a/libcxx/include/__cxx03/__algorithm/find.h +++ b/libcxx/include/__cxx03/__algorithm/find.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___ALGORITHM_FIND_H #define _LIBCPP___ALGORITHM_FIND_H -#include <__algorithm/find_segment_if.h> -#include <__algorithm/min.h> -#include <__algorithm/unwrap_iter.h> -#include <__bit/countr.h> -#include <__bit/invert_if.h> -#include <__config> -#include <__functional/identity.h> -#include 
<__functional/invoke.h> -#include <__fwd/bit_reference.h> -#include <__iterator/segmented_iterator.h> -#include <__string/constexpr_c_functions.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_signed.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/find_segment_if.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__bit/countr.h> +#include <__cxx03/__bit/invert_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__fwd/bit_reference.h> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__string/constexpr_c_functions.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/limits> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include +# include <__cxx03/cwchar> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -36,7 +36,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/find_end.h b/libcxx/include/__cxx03/__algorithm/find_end.h index 7e08e7953534e..34a45e2c66fae 100644 --- a/libcxx/include/__cxx03/__algorithm/find_end.h +++ b/libcxx/include/__cxx03/__algorithm/find_end.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___ALGORITHM_FIND_END_OF_H #define _LIBCPP___ALGORITHM_FIND_END_OF_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/search.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/reverse_iterator.h> -#include <__utility/pair.h> +#include 
<__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/search.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/find_first_of.h b/libcxx/include/__cxx03/__algorithm/find_first_of.h index 6b99f562f8804..05eb85fd663d2 100644 --- a/libcxx/include/__cxx03/__algorithm/find_first_of.h +++ b/libcxx/include/__cxx03/__algorithm/find_first_of.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___ALGORITHM_FIND_FIRST_OF_H #define _LIBCPP___ALGORITHM_FIND_FIRST_OF_H -#include <__algorithm/comp.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/find_if.h b/libcxx/include/__cxx03/__algorithm/find_if.h index 22092d352b06e..b0150e539f9ab 100644 --- a/libcxx/include/__cxx03/__algorithm/find_if.h +++ b/libcxx/include/__cxx03/__algorithm/find_if.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ALGORITHM_FIND_IF_H #define _LIBCPP___ALGORITHM_FIND_IF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/find_if_not.h b/libcxx/include/__cxx03/__algorithm/find_if_not.h index cc2001967f0c5..67d9a7deb2edd 100644 --- a/libcxx/include/__cxx03/__algorithm/find_if_not.h +++ b/libcxx/include/__cxx03/__algorithm/find_if_not.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ALGORITHM_FIND_IF_NOT_H #define 
_LIBCPP___ALGORITHM_FIND_IF_NOT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/find_segment_if.h b/libcxx/include/__cxx03/__algorithm/find_segment_if.h index 9d6064f3e283a..a6c016234418e 100644 --- a/libcxx/include/__cxx03/__algorithm/find_segment_if.h +++ b/libcxx/include/__cxx03/__algorithm/find_segment_if.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ALGORITHM_FIND_SEGMENT_IF_H #define _LIBCPP___ALGORITHM_FIND_SEGMENT_IF_H -#include <__config> -#include <__iterator/segmented_iterator.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/fold.h b/libcxx/include/__cxx03/__algorithm/fold.h index 255658f523249..7e9c745bbbdb2 100644 --- a/libcxx/include/__cxx03/__algorithm/fold.h +++ b/libcxx/include/__cxx03/__algorithm/fold.h @@ -10,30 +10,30 @@ #ifndef _LIBCPP___ALGORITHM_FOLD_H #define _LIBCPP___ALGORITHM_FOLD_H -#include <__concepts/assignable.h> -#include <__concepts/convertible_to.h> -#include <__concepts/invocable.h> -#include <__concepts/movable.h> -#include <__config> -#include <__functional/invoke.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/decay.h> -#include <__type_traits/invoke.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include 
<__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/for_each.h b/libcxx/include/__cxx03/__algorithm/for_each.h index 259e527f87f91..d2b19310c60f5 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each.h +++ b/libcxx/include/__cxx03/__algorithm/for_each.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___ALGORITHM_FOR_EACH_H #define _LIBCPP___ALGORITHM_FOR_EACH_H -#include <__algorithm/for_each_segment.h> -#include <__config> -#include <__iterator/segmented_iterator.h> -#include <__ranges/movable_box.h> -#include <__type_traits/enable_if.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/for_each_segment.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/for_each_n.h b/libcxx/include/__cxx03/__algorithm/for_each_n.h index fce380b49df3e..7b8c40eacf967 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each_n.h +++ b/libcxx/include/__cxx03/__algorithm/for_each_n.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H 
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H -#include <__config> -#include <__utility/convert_to_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/for_each_segment.h b/libcxx/include/__cxx03/__algorithm/for_each_segment.h index 93aa8259b2f7f..50c4b28deba02 100644 --- a/libcxx/include/__cxx03/__algorithm/for_each_segment.h +++ b/libcxx/include/__cxx03/__algorithm/for_each_segment.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ALGORITHM_FOR_EACH_SEGMENT_H #define _LIBCPP___ALGORITHM_FOR_EACH_SEGMENT_H -#include <__config> -#include <__iterator/segmented_iterator.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/generate.h b/libcxx/include/__cxx03/__algorithm/generate.h index c95b527402f5d..14da75cd44ceb 100644 --- a/libcxx/include/__cxx03/__algorithm/generate.h +++ b/libcxx/include/__cxx03/__algorithm/generate.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_GENERATE_H #define _LIBCPP___ALGORITHM_GENERATE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/generate_n.h b/libcxx/include/__cxx03/__algorithm/generate_n.h index f36403fd0f94a..32cc86911815c 100644 --- a/libcxx/include/__cxx03/__algorithm/generate_n.h +++ b/libcxx/include/__cxx03/__algorithm/generate_n.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ALGORITHM_GENERATE_N_H #define _LIBCPP___ALGORITHM_GENERATE_N_H -#include <__config> -#include <__utility/convert_to_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/convert_to_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__algorithm/half_positive.h b/libcxx/include/__cxx03/__algorithm/half_positive.h index ebda0da372369..4378964d9d836 100644 --- a/libcxx/include/__cxx03/__algorithm/half_positive.h +++ b/libcxx/include/__cxx03/__algorithm/half_positive.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_HALF_POSITIVE_H #define _LIBCPP___ALGORITHM_HALF_POSITIVE_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/make_unsigned.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/make_unsigned.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/in_found_result.h b/libcxx/include/__cxx03/__algorithm/in_found_result.h index a67ae387974c0..0f4895dab437e 100644 --- a/libcxx/include/__cxx03/__algorithm/in_found_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_found_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H #define _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/in_fun_result.h b/libcxx/include/__cxx03/__algorithm/in_fun_result.h index a22069a9a8dda..998d4599ceac8 100644 --- a/libcxx/include/__cxx03/__algorithm/in_fun_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_fun_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_FUN_RESULT_H #define _LIBCPP___ALGORITHM_IN_FUN_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include 
<__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/in_in_out_result.h b/libcxx/include/__cxx03/__algorithm/in_in_out_result.h index ba0380b5c6814..bb3a7e5466de0 100644 --- a/libcxx/include/__cxx03/__algorithm/in_in_out_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_in_out_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_IN_OUT_RESULT_H #define _LIBCPP___ALGORITHM_IN_IN_OUT_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/in_in_result.h b/libcxx/include/__cxx03/__algorithm/in_in_result.h index 994573fc70fd8..12f1b572c5870 100644 --- a/libcxx/include/__cxx03/__algorithm/in_in_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_in_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_IN_RESULT_H #define _LIBCPP___ALGORITHM_IN_IN_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/in_out_out_result.h 
b/libcxx/include/__cxx03/__algorithm/in_out_out_result.h index 8ceb452841a41..c7d18535d1014 100644 --- a/libcxx/include/__cxx03/__algorithm/in_out_out_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_out_out_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_OUT_OUT_RESULT_H #define _LIBCPP___ALGORITHM_IN_OUT_OUT_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/in_out_result.h b/libcxx/include/__cxx03/__algorithm/in_out_result.h index a7a986cf8e6c0..66bd1a1666988 100644 --- a/libcxx/include/__cxx03/__algorithm/in_out_result.h +++ b/libcxx/include/__cxx03/__algorithm/in_out_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_IN_OUT_RESULT_H #define _LIBCPP___ALGORITHM_IN_OUT_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/includes.h b/libcxx/include/__cxx03/__algorithm/includes.h index 62af03c374260..6d3eb44bb8c44 100644 --- a/libcxx/include/__cxx03/__algorithm/includes.h +++ b/libcxx/include/__cxx03/__algorithm/includes.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_INCLUDES_H #define _LIBCPP___ALGORITHM_INCLUDES_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include <__functional/identity.h> 
-#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_callable.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/inplace_merge.h b/libcxx/include/__cxx03/__algorithm/inplace_merge.h index a6bcc66a2fa47..3816ba5793c0d 100644 --- a/libcxx/include/__cxx03/__algorithm/inplace_merge.h +++ b/libcxx/include/__cxx03/__algorithm/inplace_merge.h @@ -9,32 +9,32 @@ #ifndef _LIBCPP___ALGORITHM_INPLACE_MERGE_H #define _LIBCPP___ALGORITHM_INPLACE_MERGE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__algorithm/min.h> -#include <__algorithm/move.h> -#include <__algorithm/rotate.h> -#include <__algorithm/upper_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__memory/destruct_n.h> -#include <__memory/temporary_buffer.h> -#include <__memory/unique_ptr.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/rotate.h> 
+#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__memory/destruct_n.h> +#include <__cxx03/__memory/temporary_buffer.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/is_heap.h b/libcxx/include/__cxx03/__algorithm/is_heap.h index c589b804a5dc0..dde4bfb6ff2db 100644 --- a/libcxx/include/__cxx03/__algorithm/is_heap.h +++ b/libcxx/include/__cxx03/__algorithm/is_heap.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_IS_HEAP_H #define _LIBCPP___ALGORITHM_IS_HEAP_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/is_heap_until.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/is_heap_until.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/is_heap_until.h b/libcxx/include/__cxx03/__algorithm/is_heap_until.h index a174f2453cfcc..d46dd343115c3 100644 --- a/libcxx/include/__cxx03/__algorithm/is_heap_until.h +++ b/libcxx/include/__cxx03/__algorithm/is_heap_until.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_IS_HEAP_UNTIL_H #define _LIBCPP___ALGORITHM_IS_HEAP_UNTIL_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include 
<__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/is_partitioned.h b/libcxx/include/__cxx03/__algorithm/is_partitioned.h index 1f7c8b0b267e7..8bd202f3e2429 100644 --- a/libcxx/include/__cxx03/__algorithm/is_partitioned.h +++ b/libcxx/include/__cxx03/__algorithm/is_partitioned.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_IS_PARTITIONED_H #define _LIBCPP___ALGORITHM_IS_PARTITIONED_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/is_permutation.h b/libcxx/include/__cxx03/__algorithm/is_permutation.h index 2ddfb32a212bb..0274ba98fcd16 100644 --- a/libcxx/include/__cxx03/__algorithm/is_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/is_permutation.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___ALGORITHM_IS_PERMUTATION_H #define _LIBCPP___ALGORITHM_IS_PERMUTATION_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__type_traits/is_callable.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__type_traits/is_callable.h> +#include 
<__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/is_sorted.h b/libcxx/include/__cxx03/__algorithm/is_sorted.h index 3befb1ac9c26a..0a003a86b9913 100644 --- a/libcxx/include/__cxx03/__algorithm/is_sorted.h +++ b/libcxx/include/__cxx03/__algorithm/is_sorted.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_IS_SORTED_H #define _LIBCPP___ALGORITHM_IS_SORTED_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/is_sorted_until.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/is_sorted_until.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/is_sorted_until.h b/libcxx/include/__cxx03/__algorithm/is_sorted_until.h index 53a49f00de31e..eb90e5dd88e67 100644 --- a/libcxx/include/__cxx03/__algorithm/is_sorted_until.h +++ b/libcxx/include/__cxx03/__algorithm/is_sorted_until.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_IS_SORTED_UNTIL_H #define _LIBCPP___ALGORITHM_IS_SORTED_UNTIL_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/iter_swap.h b/libcxx/include/__cxx03/__algorithm/iter_swap.h index a1412e5d8720b..837a603d23e32 100644 --- 
a/libcxx/include/__cxx03/__algorithm/iter_swap.h +++ b/libcxx/include/__cxx03/__algorithm/iter_swap.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_ITER_SWAP_H #define _LIBCPP___ALGORITHM_ITER_SWAP_H -#include <__config> -#include <__utility/declval.h> -#include <__utility/swap.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/iterator_operations.h b/libcxx/include/__cxx03/__algorithm/iterator_operations.h index 8ced989233bc4..b4f1f9e3ffca9 100644 --- a/libcxx/include/__cxx03/__algorithm/iterator_operations.h +++ b/libcxx/include/__cxx03/__algorithm/iterator_operations.h @@ -9,33 +9,33 @@ #ifndef _LIBCPP___ALGORITHM_ITERATOR_OPERATIONS_H #define _LIBCPP___ALGORITHM_ITERATOR_OPERATIONS_H -#include <__algorithm/iter_swap.h> -#include <__algorithm/ranges_iterator_concept.h> -#include <__assert> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/prev.h> -#include <__iterator/readable_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iter_swap.h> +#include <__cxx03/__algorithm/ranges_iterator_concept.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h b/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h index edc29e269c88c..e3d93be6177e7 100644 --- a/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h +++ b/libcxx/include/__cxx03/__algorithm/lexicographical_compare.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_LEXICOGRAPHICAL_COMPARE_H #define _LIBCPP___ALGORITHM_LEXICOGRAPHICAL_COMPARE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/lexicographical_compare_three_way.h b/libcxx/include/__cxx03/__algorithm/lexicographical_compare_three_way.h index a5872e90cf8d2..bea67a7937659 100644 --- a/libcxx/include/__cxx03/__algorithm/lexicographical_compare_three_way.h +++ b/libcxx/include/__cxx03/__algorithm/lexicographical_compare_three_way.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_LEXICOGRAPHICAL_COMPARE_THREE_WAY_H #define _LIBCPP___ALGORITHM_LEXICOGRAPHICAL_COMPARE_THREE_WAY_H -#include 
<__algorithm/min.h> -#include <__algorithm/three_way_comp_ref_type.h> -#include <__compare/compare_three_way.h> -#include <__compare/ordering.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/common_type.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/three_way_comp_ref_type.h> +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/lower_bound.h b/libcxx/include/__cxx03/__algorithm/lower_bound.h index c417d84835497..844674cb78604 100644 --- a/libcxx/include/__cxx03/__algorithm/lower_bound.h +++ b/libcxx/include/__cxx03/__algorithm/lower_bound.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ALGORITHM_LOWER_BOUND_H #define _LIBCPP___ALGORITHM_LOWER_BOUND_H -#include <__algorithm/comp.h> -#include <__algorithm/half_positive.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_callable.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/half_positive.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include 
<__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/make_heap.h b/libcxx/include/__cxx03/__algorithm/make_heap.h index e8f0cdb27333a..5239a99083f50 100644 --- a/libcxx/include/__cxx03/__algorithm/make_heap.h +++ b/libcxx/include/__cxx03/__algorithm/make_heap.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___ALGORITHM_MAKE_HEAP_H #define _LIBCPP___ALGORITHM_MAKE_HEAP_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/sift_down.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/sift_down.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/make_projected.h b/libcxx/include/__cxx03/__algorithm/make_projected.h index 5245e523f3df2..8368e269385f4 100644 --- a/libcxx/include/__cxx03/__algorithm/make_projected.h +++ b/libcxx/include/__cxx03/__algorithm/make_projected.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ALGORITHM_MAKE_PROJECTED_H #define _LIBCPP___ALGORITHM_MAKE_PROJECTED_H -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__type_traits/decay.h> -#include 
<__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_member_pointer.h> -#include <__type_traits/is_same.h> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_member_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/max.h b/libcxx/include/__cxx03/__algorithm/max.h index d4c99f6f36436..983e81cd24bc4 100644 --- a/libcxx/include/__cxx03/__algorithm/max.h +++ b/libcxx/include/__cxx03/__algorithm/max.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___ALGORITHM_MAX_H #define _LIBCPP___ALGORITHM_MAX_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/max_element.h> -#include <__config> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/max_element.h> +#include <__cxx03/__config> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/max_element.h b/libcxx/include/__cxx03/__algorithm/max_element.h index c036726cbccd8..686c116ab0105 100644 --- a/libcxx/include/__cxx03/__algorithm/max_element.h +++ b/libcxx/include/__cxx03/__algorithm/max_element.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_MAX_ELEMENT_H #define _LIBCPP___ALGORITHM_MAX_ELEMENT_H 
-#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/merge.h b/libcxx/include/__cxx03/__algorithm/merge.h index bad663c4b9f10..41b0c9263e2fd 100644 --- a/libcxx/include/__cxx03/__algorithm/merge.h +++ b/libcxx/include/__cxx03/__algorithm/merge.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_MERGE_H #define _LIBCPP___ALGORITHM_MERGE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/copy.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/min.h b/libcxx/include/__cxx03/__algorithm/min.h index 1bafad8a461eb..836f8f0028748 100644 --- a/libcxx/include/__cxx03/__algorithm/min.h +++ b/libcxx/include/__cxx03/__algorithm/min.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___ALGORITHM_MIN_H #define _LIBCPP___ALGORITHM_MIN_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/min_element.h> -#include <__config> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/min_element.h> +#include <__cxx03/__config> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/min_element.h b/libcxx/include/__cxx03/__algorithm/min_element.h index 65f3594d630ce..d625ab0886cf7 100644 --- a/libcxx/include/__cxx03/__algorithm/min_element.h +++ b/libcxx/include/__cxx03/__algorithm/min_element.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_MIN_ELEMENT_H #define _LIBCPP___ALGORITHM_MIN_ELEMENT_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_callable.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/min_max_result.h b/libcxx/include/__cxx03/__algorithm/min_max_result.h index e988df7c114ee..2b56f88f1f9fe 100644 --- a/libcxx/include/__cxx03/__algorithm/min_max_result.h +++ b/libcxx/include/__cxx03/__algorithm/min_max_result.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___ALGORITHM_MIN_MAX_RESULT_H #define _LIBCPP___ALGORITHM_MIN_MAX_RESULT_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__algorithm/minmax.h b/libcxx/include/__cxx03/__algorithm/minmax.h index 9feda2b4c0da9..a36970a0565fe 100644 --- a/libcxx/include/__cxx03/__algorithm/minmax.h +++ b/libcxx/include/__cxx03/__algorithm/minmax.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_MINMAX_H #define _LIBCPP___ALGORITHM_MINMAX_H -#include <__algorithm/comp.h> -#include <__algorithm/minmax_element.h> -#include <__config> -#include <__functional/identity.h> -#include <__type_traits/is_callable.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/minmax_element.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/minmax_element.h b/libcxx/include/__cxx03/__algorithm/minmax_element.h index 43cb23347c346..6298784092354 100644 --- a/libcxx/include/__cxx03/__algorithm/minmax_element.h +++ b/libcxx/include/__cxx03/__algorithm/minmax_element.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_MINMAX_ELEMENT_H #define _LIBCPP___ALGORITHM_MINMAX_ELEMENT_H -#include <__algorithm/comp.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_callable.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/mismatch.h b/libcxx/include/__cxx03/__algorithm/mismatch.h index 
632bec02406a4..1eb9717c9ae6c 100644 --- a/libcxx/include/__cxx03/__algorithm/mismatch.h +++ b/libcxx/include/__cxx03/__algorithm/mismatch.h @@ -10,29 +10,29 @@ #ifndef _LIBCPP___ALGORITHM_MISMATCH_H #define _LIBCPP___ALGORITHM_MISMATCH_H -#include <__algorithm/comp.h> -#include <__algorithm/min.h> -#include <__algorithm/simd_utils.h> -#include <__algorithm/unwrap_iter.h> -#include <__config> -#include <__functional/identity.h> -#include <__iterator/aliasing_iterator.h> -#include <__type_traits/desugars_to.h> -#include <__type_traits/invoke.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_equality_comparable.h> -#include <__type_traits/is_integral.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/unreachable.h> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/simd_utils.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/aliasing_iterator.h> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_equality_comparable.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/move.h b/libcxx/include/__cxx03/__algorithm/move.h index 1716d43e2a613..11366afe99ffb 100644 --- a/libcxx/include/__cxx03/__algorithm/move.h +++ b/libcxx/include/__cxx03/__algorithm/move.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_H #define 
_LIBCPP___ALGORITHM_MOVE_H -#include <__algorithm/copy_move_common.h> -#include <__algorithm/for_each_segment.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__config> -#include <__iterator/segmented_iterator.h> -#include <__type_traits/common_type.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/copy_move_common.h> +#include <__cxx03/__algorithm/for_each_segment.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/move_backward.h b/libcxx/include/__cxx03/__algorithm/move_backward.h index 4beb7bdbaac0d..9d3e87bb9667e 100644 --- a/libcxx/include/__cxx03/__algorithm/move_backward.h +++ b/libcxx/include/__cxx03/__algorithm/move_backward.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_BACKWARD_H #define _LIBCPP___ALGORITHM_MOVE_BACKWARD_H -#include <__algorithm/copy_move_common.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__config> -#include <__iterator/segmented_iterator.h> -#include <__type_traits/common_type.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/copy_move_common.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/segmented_iterator.h> +#include 
<__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/next_permutation.h b/libcxx/include/__cxx03/__algorithm/next_permutation.h index 011ee028cc2f5..02aa9113d1830 100644 --- a/libcxx/include/__cxx03/__algorithm/next_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/next_permutation.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_NEXT_PERMUTATION_H #define _LIBCPP___ALGORITHM_NEXT_PERMUTATION_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/reverse.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/reverse.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/none_of.h b/libcxx/include/__cxx03/__algorithm/none_of.h index 50841ba17cc63..e5d095da23482 100644 --- a/libcxx/include/__cxx03/__algorithm/none_of.h +++ b/libcxx/include/__cxx03/__algorithm/none_of.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ALGORITHM_NONE_OF_H #define _LIBCPP___ALGORITHM_NONE_OF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 
# pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/nth_element.h b/libcxx/include/__cxx03/__algorithm/nth_element.h index da748d7255aba..f840864c531c5 100644 --- a/libcxx/include/__cxx03/__algorithm/nth_element.h +++ b/libcxx/include/__cxx03/__algorithm/nth_element.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_NTH_ELEMENT_H #define _LIBCPP___ALGORITHM_NTH_ELEMENT_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/sort.h> -#include <__assert> -#include <__config> -#include <__debug_utils/randomize_range.h> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/sort.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/randomize_range.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/partial_sort.h b/libcxx/include/__cxx03/__algorithm/partial_sort.h index 7f8d0c49147e3..6984b1c91f201 100644 --- a/libcxx/include/__cxx03/__algorithm/partial_sort.h +++ b/libcxx/include/__cxx03/__algorithm/partial_sort.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_PARTIAL_SORT_H #define _LIBCPP___ALGORITHM_PARTIAL_SORT_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_heap.h> -#include <__algorithm/sift_down.h> -#include <__algorithm/sort_heap.h> -#include <__config> -#include <__debug_utils/randomize_range.h> -#include <__iterator/iterator_traits.h> -#include 
<__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_heap.h> +#include <__cxx03/__algorithm/sift_down.h> +#include <__cxx03/__algorithm/sort_heap.h> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/randomize_range.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h b/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h index ef7c9d34d9498..c8849b9f6175f 100644 --- a/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h +++ b/libcxx/include/__cxx03/__algorithm/partial_sort_copy.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_PARTIAL_SORT_COPY_H #define _LIBCPP___ALGORITHM_PARTIAL_SORT_COPY_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_heap.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/sift_down.h> -#include <__algorithm/sort_heap.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_callable.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_heap.h> +#include <__cxx03/__algorithm/make_projected.h> +#include 
<__cxx03/__algorithm/sift_down.h> +#include <__cxx03/__algorithm/sort_heap.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/partition.h b/libcxx/include/__cxx03/__algorithm/partition.h index 824e49b9ec214..5f26384b7c8ef 100644 --- a/libcxx/include/__cxx03/__algorithm/partition.h +++ b/libcxx/include/__cxx03/__algorithm/partition.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___ALGORITHM_PARTITION_H #define _LIBCPP___ALGORITHM_PARTITION_H -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/partition_copy.h b/libcxx/include/__cxx03/__algorithm/partition_copy.h index 147b45c7882a5..916a1c301d6d5 100644 --- a/libcxx/include/__cxx03/__algorithm/partition_copy.h +++ b/libcxx/include/__cxx03/__algorithm/partition_copy.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PARTITION_COPY_H #define _LIBCPP___ALGORITHM_PARTITION_COPY_H -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/pair.h> +#include <__cxx03/__config> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/partition_point.h b/libcxx/include/__cxx03/__algorithm/partition_point.h index 504dbf1d1a055..c28ee8cae0e6d 100644 --- a/libcxx/include/__cxx03/__algorithm/partition_point.h +++ b/libcxx/include/__cxx03/__algorithm/partition_point.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_PARTITION_POINT_H #define _LIBCPP___ALGORITHM_PARTITION_POINT_H -#include <__algorithm/half_positive.h> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/half_positive.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/pop_heap.h b/libcxx/include/__cxx03/__algorithm/pop_heap.h index 6d23830097ff9..800cabf94eaa9 100644 --- a/libcxx/include/__cxx03/__algorithm/pop_heap.h +++ b/libcxx/include/__cxx03/__algorithm/pop_heap.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_POP_HEAP_H #define _LIBCPP___ALGORITHM_POP_HEAP_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/push_heap.h> -#include <__algorithm/sift_down.h> -#include <__assert> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/push_heap.h> +#include <__cxx03/__algorithm/sift_down.h> +#include <__cxx03/__assert> 
+#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/prev_permutation.h b/libcxx/include/__cxx03/__algorithm/prev_permutation.h index 8d15b6806401d..2569156eac642 100644 --- a/libcxx/include/__cxx03/__algorithm/prev_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/prev_permutation.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_PREV_PERMUTATION_H #define _LIBCPP___ALGORITHM_PREV_PERMUTATION_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/reverse.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/reverse.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/pstl.h b/libcxx/include/__cxx03/__algorithm/pstl.h index 0bb052b3f97c7..8dea15e09cb44 100644 --- a/libcxx/include/__cxx03/__algorithm/pstl.h +++ b/libcxx/include/__cxx03/__algorithm/pstl.h @@ -9,28 +9,28 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_H #define _LIBCPP___ALGORITHM_PSTL_H -#include <__config> +#include <__cxx03/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 -# include <__functional/operations.h> -# include <__iterator/cpp17_iterator_concepts.h> -# include <__iterator/iterator_traits.h> -# include <__pstl/backend.h> -# include <__pstl/dispatch.h> -# include <__pstl/handle_exception.h> -# include <__type_traits/enable_if.h> -# include <__type_traits/is_execution_policy.h> -# include <__type_traits/remove_cvref.h> -# include <__utility/forward.h> -# include <__utility/move.h> +# include <__cxx03/__functional/operations.h> +# include <__cxx03/__iterator/cpp17_iterator_concepts.h> +# include <__cxx03/__iterator/iterator_traits.h> +# include <__cxx03/__pstl/backend.h> +# include <__cxx03/__pstl/dispatch.h> +# include <__cxx03/__pstl/handle_exception.h> +# include <__cxx03/__type_traits/enable_if.h> +# include <__cxx03/__type_traits/is_execution_policy.h> +# include <__cxx03/__type_traits/remove_cvref.h> +# include <__cxx03/__utility/forward.h> +# include <__cxx03/__utility/move.h> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/push_heap.h b/libcxx/include/__cxx03/__algorithm/push_heap.h index ec0b445f2b70f..de4dcc33fb1fd 100644 --- a/libcxx/include/__cxx03/__algorithm/push_heap.h +++ b/libcxx/include/__cxx03/__algorithm/push_heap.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_PUSH_HEAP_H #define _LIBCPP___ALGORITHM_PUSH_HEAP_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include 
<__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_adjacent_find.h b/libcxx/include/__cxx03/__algorithm/ranges_adjacent_find.h index 3c54f723310a6..26caf4fe40ae0 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_adjacent_find.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_adjacent_find.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ADJACENT_FIND_H #define _LIBCPP___ALGORITHM_RANGES_ADJACENT_FIND_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_all_of.h b/libcxx/include/__cxx03/__algorithm/ranges_all_of.h index 2f603b32f32d0..7e92c37b8fd99 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_all_of.h +++ 
b/libcxx/include/__cxx03/__algorithm/ranges_all_of.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ALL_OF_H #define _LIBCPP___ALGORITHM_RANGES_ALL_OF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_any_of.h b/libcxx/include/__cxx03/__algorithm/ranges_any_of.h index 205fcecc086e7..d20177223fc6b 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_any_of.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_any_of.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ANY_OF_H #define _LIBCPP___ALGORITHM_RANGES_ANY_OF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> 
+#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_binary_search.h b/libcxx/include/__cxx03/__algorithm/ranges_binary_search.h index 1ef2bd62b5995..675cc1ac8074a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_binary_search.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_binary_search.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_BINARY_SEARCH_H #define _LIBCPP___ALGORITHM_RANGES_BINARY_SEARCH_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_clamp.h b/libcxx/include/__cxx03/__algorithm/ranges_clamp.h index e6181ef9435e0..3f9f6b1f9944f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_clamp.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_clamp.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_CLAMP_H #define _LIBCPP___ALGORITHM_RANGES_CLAMP_H -#include <__assert> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include 
<__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__utility/forward.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_contains.h b/libcxx/include/__cxx03/__algorithm/ranges_contains.h index 4836c3baed173..312de975efa5f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_contains.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_contains.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_CONTAINS_H #define _LIBCPP___ALGORITHM_RANGES_CONTAINS_H -#include <__algorithm/ranges_find.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_find.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> 
+#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_contains_subrange.h b/libcxx/include/__cxx03/__algorithm/ranges_contains_subrange.h index 4398c457fd054..d3302f1c915df 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_contains_subrange.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_contains_subrange.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_CONTAINS_SUBRANGE_H #define _LIBCPP___ALGORITHM_RANGES_CONTAINS_SUBRANGE_H -#include <__algorithm/ranges_search.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_search.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_copy.h index e1d6d32f05f7e..cf3f9974936ad 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_copy.h @@ -9,24 +9,24 @@ 
#ifndef _LIBCPP___ALGORITHM_RANGES_COPY_H #define _LIBCPP___ALGORITHM_RANGES_COPY_H -#include <__algorithm/copy.h> -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_copy_backward.h b/libcxx/include/__cxx03/__algorithm/ranges_copy_backward.h index 93e326042503f..a6bb781397dec 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_copy_backward.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_copy_backward.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_COPY_BACKWARD_H #define _LIBCPP___ALGORITHM_RANGES_COPY_BACKWARD_H -#include <__algorithm/copy_backward.h> -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/copy_backward.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include 
<__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_copy_if.h b/libcxx/include/__cxx03/__algorithm/ranges_copy_if.h index 4b41d2154e7f8..f0d3005a4982b 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_copy_if.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_COPY_IF_H #define _LIBCPP___ALGORITHM_RANGES_COPY_IF_H -#include <__algorithm/in_out_result.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_copy_n.h b/libcxx/include/__cxx03/__algorithm/ranges_copy_n.h index 4353fa99278c8..92f10e7466e53 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_copy_n.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_copy_n.h @@ -9,24 +9,24 @@ #ifndef 
_LIBCPP___ALGORITHM_RANGES_COPY_N_H #define _LIBCPP___ALGORITHM_RANGES_COPY_N_H -#include <__algorithm/copy.h> -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__functional/identity.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/unreachable_sentinel.h> -#include <__iterator/wrap_iter.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/unreachable_sentinel.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_count.h b/libcxx/include/__cxx03/__algorithm/ranges_count.h index 4f35117438705..ae24b57c17d0a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_count.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_count.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_COUNT_H #define _LIBCPP___ALGORITHM_RANGES_COUNT_H -#include <__algorithm/count.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include 
<__cxx03/__algorithm/count.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_count_if.h b/libcxx/include/__cxx03/__algorithm/ranges_count_if.h index 5f2396ff7d531..ecadf6fac7128 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_count_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_count_if.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_COUNT_IF_H #define _LIBCPP___ALGORITHM_RANGES_COUNT_IF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 
# pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_ends_with.h b/libcxx/include/__cxx03/__algorithm/ranges_ends_with.h index 06efdef36b7cf..85329f8f4e58a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_ends_with.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_ends_with.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ENDS_WITH_H #define _LIBCPP___ALGORITHM_RANGES_ENDS_WITH_H -#include <__algorithm/ranges_equal.h> -#include <__algorithm/ranges_starts_with.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/reverse_iterator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_equal.h> +#include <__cxx03/__algorithm/ranges_starts_with.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_equal.h b/libcxx/include/__cxx03/__algorithm/ranges_equal.h index 
edbd0e3641c1b..b89235a299414 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_equal.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_equal.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_EQUAL_H #define _LIBCPP___ALGORITHM_RANGES_EQUAL_H -#include <__algorithm/equal.h> -#include <__algorithm/unwrap_range.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/indirectly_comparable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/unwrap_range.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_equal_range.h b/libcxx/include/__cxx03/__algorithm/ranges_equal_range.h index 4a308e016b546..d6a38e5edd312 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_equal_range.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_equal_range.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_EQUAL_RANGE_H #define _LIBCPP___ALGORITHM_RANGES_EQUAL_RANGE_H -#include <__algorithm/equal_range.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> 
-#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__ranges/subrange.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/equal_range.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_fill.h b/libcxx/include/__cxx03/__algorithm/ranges_fill.h index 7a177d85e9f07..56008ec7304da 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_fill.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_fill.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FILL_H #define _LIBCPP___ALGORITHM_RANGES_FILL_H -#include <__algorithm/ranges_fill_n.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> +#include <__cxx03/__algorithm/ranges_fill_n.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_fill_n.h b/libcxx/include/__cxx03/__algorithm/ranges_fill_n.h index a6e988c0089ce..bb00676943a6c 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_fill_n.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_fill_n.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FILL_N_H #define _LIBCPP___ALGORITHM_RANGES_FILL_N_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find.h b/libcxx/include/__cxx03/__algorithm/ranges_find.h index 6b0d5efe37ab8..896fe920fd815 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_H #define _LIBCPP___ALGORITHM_RANGES_FIND_H -#include <__algorithm/find.h> -#include <__algorithm/ranges_find_if.h> -#include <__algorithm/unwrap_range.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/ranges_find_if.h> +#include <__cxx03/__algorithm/unwrap_range.h> +#include <__cxx03/__config> +#include 
<__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find_end.h b/libcxx/include/__cxx03/__algorithm/ranges_find_end.h index e49e66dd4ac04..c71f32546afb1 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find_end.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find_end.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_END_H #define _LIBCPP___ALGORITHM_RANGES_FIND_END_H -#include <__algorithm/find_end.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/ranges_iterator_concept.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/find_end.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/ranges_iterator_concept.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include 
<__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find_first_of.h b/libcxx/include/__cxx03/__algorithm/ranges_find_first_of.h index d92d9686bc442..23ae82eb4685f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find_first_of.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find_first_of.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_FIRST_OF_H #define _LIBCPP___ALGORITHM_RANGES_FIND_FIRST_OF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find_if.h b/libcxx/include/__cxx03/__algorithm/ranges_find_if.h index 888f9ec3cb2d5..a518f532a73e2 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find_if.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_IF_H #define 
_LIBCPP___ALGORITHM_RANGES_FIND_IF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find_if_not.h b/libcxx/include/__cxx03/__algorithm/ranges_find_if_not.h index ec19545b5a1b7..c54d565188c4f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find_if_not.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find_if_not.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_IF_NOT_H #define _LIBCPP___ALGORITHM_RANGES_FIND_IF_NOT_H -#include <__algorithm/ranges_find_if.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_find_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> 
+#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_find_last.h b/libcxx/include/__cxx03/__algorithm/ranges_find_last.h index 95f7e77b8ccbe..3028dc5bf53cb 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_find_last.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_find_last.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H #define _LIBCPP___ALGORITHM_RANGES_FIND_LAST_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/next.h> -#include <__iterator/prev.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_for_each.h b/libcxx/include/__cxx03/__algorithm/ranges_for_each.h index 225dc774c8764..eca0f35e49fbd 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_for_each.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_for_each.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H -#include <__algorithm/in_fun_result.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_fun_result.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_for_each_n.h b/libcxx/include/__cxx03/__algorithm/ranges_for_each_n.h index d1fdca34cc5a1..fa3051fa8a0aa 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_for_each_n.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_for_each_n.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H -#include <__algorithm/in_fun_result.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include 
<__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_fun_result.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_generate.h b/libcxx/include/__cxx03/__algorithm/ranges_generate.h index e6467198e6ba2..570ed73a29550 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_generate.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_generate.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_GENERATE_H #define _LIBCPP___ALGORITHM_RANGES_GENERATE_H -#include <__concepts/constructible.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include 
<__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_generate_n.h b/libcxx/include/__cxx03/__algorithm/ranges_generate_n.h index cd5fd7483ab2c..b7c684be2e03f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_generate_n.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_generate_n.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H #define _LIBCPP___ALGORITHM_RANGES_GENERATE_N_H -#include <__concepts/constructible.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_includes.h b/libcxx/include/__cxx03/__algorithm/ranges_includes.h index c4c3b8ed088d3..3abe69118f20a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_includes.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_includes.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_INCLUDES_H #define _LIBCPP___ALGORITHM_RANGES_INCLUDES_H -#include <__algorithm/includes.h> -#include <__algorithm/make_projected.h> -#include 
<__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/includes.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_inplace_merge.h b/libcxx/include/__cxx03/__algorithm/ranges_inplace_merge.h index d94c0ad465677..ebb711d731209 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_inplace_merge.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_inplace_merge.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_INPLACE_MERGE_H #define _LIBCPP___ALGORITHM_RANGES_INPLACE_MERGE_H -#include <__algorithm/inplace_merge.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include 
<__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/inplace_merge.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_heap.h b/libcxx/include/__cxx03/__algorithm/ranges_is_heap.h index 3d9e18ce1d906..7c6b60a56f6c7 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_heap.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_heap.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_IS_HEAP_H #define _LIBCPP___ALGORITHM_RANGES_IS_HEAP_H -#include <__algorithm/is_heap_until.h> -#include <__algorithm/make_projected.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/is_heap_until.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__config> +#include 
<__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_heap_until.h b/libcxx/include/__cxx03/__algorithm/ranges_is_heap_until.h index 7a2e1fc7705b6..e32e802f44732 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_heap_until.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_heap_until.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_IS_HEAP_UNTIL_H #define _LIBCPP___ALGORITHM_RANGES_IS_HEAP_UNTIL_H -#include <__algorithm/is_heap_until.h> -#include <__algorithm/make_projected.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/is_heap_until.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_partitioned.h b/libcxx/include/__cxx03/__algorithm/ranges_is_partitioned.h index 5be6fba46fd9e..87d59d0410fe5 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_partitioned.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_partitioned.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_IS_PARTITIONED_H #define _LIBCPP___ALGORITHM_RANGES_IS_PARTITIONED_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_permutation.h b/libcxx/include/__cxx03/__algorithm/ranges_is_permutation.h index 1f8d67007a573..a894854a8b65a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_permutation.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_IS_PERMUTATION_H #define _LIBCPP___ALGORITHM_RANGES_IS_PERMUTATION_H -#include <__algorithm/is_permutation.h> -#include <__algorithm/iterator_operations.h> -#include 
<__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/is_permutation.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_sorted.h b/libcxx/include/__cxx03/__algorithm/ranges_is_sorted.h index 5b88d422b4b09..201673553fd4f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_sorted.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_sorted.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP__ALGORITHM_RANGES_IS_SORTED_H #define _LIBCPP__ALGORITHM_RANGES_IS_SORTED_H -#include <__algorithm/ranges_is_sorted_until.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_is_sorted_until.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include 
<__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_is_sorted_until.h b/libcxx/include/__cxx03/__algorithm/ranges_is_sorted_until.h index 54de530c8b2fd..713d2664884ac 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_is_sorted_until.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_is_sorted_until.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP__ALGORITHM_RANGES_IS_SORTED_UNTIL_H #define _LIBCPP__ALGORITHM_RANGES_IS_SORTED_UNTIL_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_iterator_concept.h b/libcxx/include/__cxx03/__algorithm/ranges_iterator_concept.h index 2af891d3af005..eb2025c234ba1 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_iterator_concept.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_iterator_concept.h @@ -9,17 +9,17 @@ #ifndef 
_LIBCPP___ALGORITHM_RANGES_ITERATOR_CONCEPT_H #define _LIBCPP___ALGORITHM_RANGES_ITERATOR_CONCEPT_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_lexicographical_compare.h b/libcxx/include/__cxx03/__algorithm/ranges_lexicographical_compare.h index 6d82017e302a7..ba87a20b425b4 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_lexicographical_compare.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_lexicographical_compare.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_LEXICOGRAPHICAL_COMPARE_H #define _LIBCPP___ALGORITHM_RANGES_LEXICOGRAPHICAL_COMPARE_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git 
a/libcxx/include/__cxx03/__algorithm/ranges_lower_bound.h b/libcxx/include/__cxx03/__algorithm/ranges_lower_bound.h index 0651147e04249..2c312866ee930 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_lower_bound.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_lower_bound.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_LOWER_BOUND_H #define _LIBCPP___ALGORITHM_RANGES_LOWER_BOUND_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_make_heap.h b/libcxx/include/__cxx03/__algorithm/ranges_make_heap.h index fe9c024fbf8a8..5a00575abbd34 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_make_heap.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_make_heap.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MAKE_HEAP_H #define 
_LIBCPP___ALGORITHM_RANGES_MAKE_HEAP_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_heap.h> -#include <__algorithm/make_projected.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_heap.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_max.h b/libcxx/include/__cxx03/__algorithm/ranges_max.h index d0ee6f314b0c3..d61a54fa7fff7 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_max.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_max.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MAX_H #define _LIBCPP___ALGORITHM_RANGES_MAX_H -#include 
<__algorithm/ranges_min_element.h> -#include <__assert> -#include <__concepts/copyable.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/ranges_min_element.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -31,7 +31,7 @@ #if _LIBCPP_STD_VER >= 20 _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_max_element.h b/libcxx/include/__cxx03/__algorithm/ranges_max_element.h index c577309271165..6bcf77bc29d4d 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_max_element.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_max_element.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H #define _LIBCPP___ALGORITHM_RANGES_MAX_ELEMENT_H -#include <__algorithm/ranges_min_element.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include 
<__ranges/dangling.h> +#include <__cxx03/__algorithm/ranges_min_element.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_merge.h b/libcxx/include/__cxx03/__algorithm/ranges_merge.h index bdf9a62d90bd2..697dfdae0b300 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_merge.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_merge.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MERGE_H #define _LIBCPP___ALGORITHM_RANGES_MERGE_H -#include <__algorithm/in_in_out_result.h> -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/mergeable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_out_result.h> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/mergeable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include 
<__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_min.h b/libcxx/include/__cxx03/__algorithm/ranges_min.h index cc569d2a060c2..bcf0705071318 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_min.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_min.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MIN_H #define _LIBCPP___ALGORITHM_RANGES_MIN_H -#include <__algorithm/ranges_min_element.h> -#include <__assert> -#include <__concepts/copyable.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__type_traits/is_trivially_copyable.h> -#include +#include <__cxx03/__algorithm/ranges_min_element.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -30,7 +30,7 @@ #if _LIBCPP_STD_VER >= 20 _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_min_element.h b/libcxx/include/__cxx03/__algorithm/ranges_min_element.h index 588ef258e26f5..320a47a57ec6d 100644 --- 
a/libcxx/include/__cxx03/__algorithm/ranges_min_element.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_min_element.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MIN_ELEMENT_H #define _LIBCPP___ALGORITHM_RANGES_MIN_ELEMENT_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_minmax.h b/libcxx/include/__cxx03/__algorithm/ranges_minmax.h index 09cbefd91a8c7..8adf1af585518 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_minmax.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_minmax.h @@ -9,28 +9,28 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MINMAX_H #define _LIBCPP___ALGORITHM_RANGES_MINMAX_H -#include <__algorithm/min_max_result.h> -#include <__algorithm/minmax_element.h> -#include <__assert> -#include <__concepts/copyable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include 
<__ranges/concepts.h> -#include <__type_traits/desugars_to.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/min_max_result.h> +#include <__cxx03/__algorithm/minmax_element.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -39,7 +39,7 @@ #if _LIBCPP_STD_VER >= 20 _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_minmax_element.h b/libcxx/include/__cxx03/__algorithm/ranges_minmax_element.h index 4bf6d2404e463..ebc44082f4c3a 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_minmax_element.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_minmax_element.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MINMAX_ELEMENT_H #define _LIBCPP___ALGORITHM_RANGES_MINMAX_ELEMENT_H -#include <__algorithm/min_max_result.h> -#include <__algorithm/minmax_element.h> -#include <__config> 
-#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/min_max_result.h> +#include <__cxx03/__algorithm/minmax_element.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_mismatch.h b/libcxx/include/__cxx03/__algorithm/ranges_mismatch.h index c4bf0022a9bcc..ccca6c7f6eb55 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_mismatch.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_mismatch.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MISMATCH_H #define _LIBCPP___ALGORITHM_RANGES_MISMATCH_H -#include <__algorithm/in_in_result.h> -#include <__algorithm/mismatch.h> -#include <__algorithm/unwrap_range.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include 
<__cxx03/__algorithm/in_in_result.h> +#include <__cxx03/__algorithm/mismatch.h> +#include <__cxx03/__algorithm/unwrap_range.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/ranges_move.h b/libcxx/include/__cxx03/__algorithm/ranges_move.h index be869f36c9730..452608bef7456 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_move.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_move.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MOVE_H #define _LIBCPP___ALGORITHM_RANGES_MOVE_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/move.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git 
a/libcxx/include/__cxx03/__algorithm/ranges_move_backward.h b/libcxx/include/__cxx03/__algorithm/ranges_move_backward.h index 6d4071a33b812..e353d9e2ffd10 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_move_backward.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_move_backward.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MOVE_BACKWARD_H #define _LIBCPP___ALGORITHM_RANGES_MOVE_BACKWARD_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/move_backward.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_move.h> -#include <__iterator/next.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/move_backward.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_next_permutation.h b/libcxx/include/__cxx03/__algorithm/ranges_next_permutation.h index 18535e0a6254a..b134cfca38195 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_next_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_next_permutation.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_NEXT_PERMUTATION_H #define _LIBCPP___ALGORITHM_RANGES_NEXT_PERMUTATION_H -#include <__algorithm/in_found_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> 
-#include <__algorithm/next_permutation.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/in_found_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/next_permutation.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_none_of.h b/libcxx/include/__cxx03/__algorithm/ranges_none_of.h index 7df3c1829fcfc..36d5477e14dee 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_none_of.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_none_of.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_NONE_OF_H #define _LIBCPP___ALGORITHM_RANGES_NONE_OF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> 
+#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_nth_element.h b/libcxx/include/__cxx03/__algorithm/ranges_nth_element.h index 90ade9efe10da..df428390f7a3d 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_nth_element.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_nth_element.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_NTH_ELEMENT_H #define _LIBCPP___ALGORITHM_RANGES_NTH_ELEMENT_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/nth_element.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/nth_element.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> 
+#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_partial_sort.h b/libcxx/include/__cxx03/__algorithm/ranges_partial_sort.h index c67247d2e0a77..d19fc9b5297eb 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_partial_sort.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_partial_sort.h @@ -9,32 +9,32 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_H #define _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/partial_sort.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/partial_sort.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include 
<__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_partial_sort_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_partial_sort_copy.h index b3bdeb78fb6f6..bd2be3cb2cf2d 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_partial_sort_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_partial_sort_copy.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_COPY_H #define _LIBCPP___ALGORITHM_RANGES_PARTIAL_SORT_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/partial_sort_copy.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/partial_sort_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> +#include 
<__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_partition.h b/libcxx/include/__cxx03/__algorithm/ranges_partition.h index a67ac4c967570..d6c147c441def 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_partition.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_partition.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PARTITION_H #define _LIBCPP___ALGORITHM_RANGES_PARTITION_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/partition.h> -#include <__algorithm/ranges_iterator_concept.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/partition.h> +#include <__cxx03/__algorithm/ranges_iterator_concept.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/forward.h> +#include 
<__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_partition_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_partition_copy.h index d60c865dd2a8a..42cbb3a96a237 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_partition_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_partition_copy.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PARTITION_COPY_H #define _LIBCPP___ALGORITHM_RANGES_PARTITION_COPY_H -#include <__algorithm/in_out_out_result.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_out_result.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_partition_point.h b/libcxx/include/__cxx03/__algorithm/ranges_partition_point.h index c5b11b5fed192..60731b2ddcf3c 100644 --- 
a/libcxx/include/__cxx03/__algorithm/ranges_partition_point.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_partition_point.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PARTITION_POINT_H #define _LIBCPP___ALGORITHM_RANGES_PARTITION_POINT_H -#include <__algorithm/half_positive.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/half_positive.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_pop_heap.h b/libcxx/include/__cxx03/__algorithm/ranges_pop_heap.h index 01f92c0f22888..d201d62c73575 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_pop_heap.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_pop_heap.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_POP_HEAP_H #define _LIBCPP___ALGORITHM_RANGES_POP_HEAP_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/pop_heap.h> -#include <__concepts/same_as.h> -#include <__config> -#include 
<__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/pop_heap.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_prev_permutation.h b/libcxx/include/__cxx03/__algorithm/ranges_prev_permutation.h index 225cee9b75ec6..8d40d44e6ac1e 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_prev_permutation.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_prev_permutation.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PREV_PERMUTATION_H #define _LIBCPP___ALGORITHM_RANGES_PREV_PERMUTATION_H -#include <__algorithm/in_found_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include 
<__algorithm/prev_permutation.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/in_found_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/prev_permutation.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_push_heap.h b/libcxx/include/__cxx03/__algorithm/ranges_push_heap.h index 9d187af38c531..39d219e763dee 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_push_heap.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_push_heap.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_PUSH_HEAP_H #define _LIBCPP___ALGORITHM_RANGES_PUSH_HEAP_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/push_heap.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include 
<__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/push_heap.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_remove.h b/libcxx/include/__cxx03/__algorithm/ranges_remove.h index 17c3a2c5cd06b..d766a7b4a846f 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_remove.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_remove.h @@ -8,25 +8,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REMOVE_H #define _LIBCPP___ALGORITHM_RANGES_REMOVE_H -#include <__config> +#include <__cxx03/__config> -#include <__algorithm/ranges_remove_if.h> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_remove_if.h> +#include 
<__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_remove_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_remove_copy.h index 84529eceac68c..fdb04cd96cf89 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_remove_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_remove_copy.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_H #define _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/ranges_remove_copy_if.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/ranges_remove_copy_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif 
_LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_remove_copy_if.h b/libcxx/include/__cxx03/__algorithm/ranges_remove_copy_if.h index 56fe017533120..dd643a581a1d2 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_remove_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_remove_copy_if.h @@ -9,28 +9,28 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_IF_H #define _LIBCPP___ALGORITHM_RANGES_REMOVE_COPY_IF_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/remove_copy_if.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/remove_copy_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_remove_if.h 
b/libcxx/include/__cxx03/__algorithm/ranges_remove_if.h index 0ea5d9a01b881..9d4ccd0f0b52d 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_remove_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_remove_if.h @@ -8,27 +8,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REMOVE_IF_H #define _LIBCPP___ALGORITHM_RANGES_REMOVE_IF_H -#include <__config> - -#include <__algorithm/ranges_find_if.h> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iter_move.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__config> + +#include <__cxx03/__algorithm/ranges_find_if.h> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_replace.h b/libcxx/include/__cxx03/__algorithm/ranges_replace.h index 2b88dc032972f..774d7632e41cf 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_replace.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_replace.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REPLACE_H #define _LIBCPP___ALGORITHM_RANGES_REPLACE_H -#include <__algorithm/ranges_replace_if.h> -#include <__config> -#include <__functional/identity.h> 
-#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_replace_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_replace_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_replace_copy.h index 633f993e5c948..d90715d3661ba 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_replace_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_replace_copy.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_H #define _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/ranges_replace_copy_if.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/ranges_replace_copy_if.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> 
+#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_replace_copy_if.h b/libcxx/include/__cxx03/__algorithm/ranges_replace_copy_if.h index e065c3ac0acc9..88e5ff71c58c5 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_replace_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_replace_copy_if.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_IF_H #define _LIBCPP___ALGORITHM_RANGES_REPLACE_COPY_IF_H -#include <__algorithm/in_out_result.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_replace_if.h b/libcxx/include/__cxx03/__algorithm/ranges_replace_if.h index 6445f42aea190..6d9ff45a59c97 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_replace_if.h +++ 
b/libcxx/include/__cxx03/__algorithm/ranges_replace_if.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REPLACE_IF_H #define _LIBCPP___ALGORITHM_RANGES_REPLACE_IF_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_reverse.h b/libcxx/include/__cxx03/__algorithm/ranges_reverse.h index 9ec865995b4a5..78614666200ef 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_reverse.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_reverse.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REVERSE_H #define _LIBCPP___ALGORITHM_RANGES_REVERSE_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_swap.h> -#include <__iterator/next.h> -#include <__iterator/permutable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 
# pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/ranges_reverse_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_reverse_copy.h index 60043787a7170..7aab06d721430 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_reverse_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_reverse_copy.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_REVERSE_COPY_H #define _LIBCPP___ALGORITHM_RANGES_REVERSE_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/next.h> -#include <__iterator/reverse_iterator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_rotate.h b/libcxx/include/__cxx03/__algorithm/ranges_rotate.h index 8d33a6f0799bf..2efa88748ff70 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_rotate.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_rotate.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ROTATE_H #define _LIBCPP___ALGORITHM_RANGES_ROTATE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/ranges_iterator_concept.h> -#include <__algorithm/rotate.h> -#include <__config> -#include 
<__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/permutable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/ranges_iterator_concept.h> +#include <__cxx03/__algorithm/rotate.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_rotate_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_rotate_copy.h index 26fe110b53896..7aa1fa13feddd 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_rotate_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_rotate_copy.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_ROTATE_COPY_H #define _LIBCPP___ALGORITHM_RANGES_ROTATE_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include 
<__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_sample.h b/libcxx/include/__cxx03/__algorithm/ranges_sample.h index e4f60a7b66be2..08a44d3a3c855 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_sample.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_sample.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SAMPLE_H #define _LIBCPP___ALGORITHM_RANGES_SAMPLE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/sample.h> -#include <__algorithm/uniform_random_bit_generator_adaptor.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__random/uniform_random_bit_generator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__type_traits/remove_reference.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/sample.h> +#include <__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__random/uniform_random_bit_generator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_search.h b/libcxx/include/__cxx03/__algorithm/ranges_search.h index 55294c60631b1..68ac1fce9e7bf 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_search.h +++ 
b/libcxx/include/__cxx03/__algorithm/ranges_search.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SEARCH_H #define _LIBCPP___ALGORITHM_RANGES_SEARCH_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/search.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/indirectly_comparable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/search.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/ranges_search_n.h b/libcxx/include/__cxx03/__algorithm/ranges_search_n.h index 56e12755b9bf6..c7c670f02c20c 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_search_n.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_search_n.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SEARCH_N_H #define _LIBCPP___ALGORITHM_RANGES_SEARCH_N_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/search_n.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include 
<__iterator/incrementable_traits.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/search_n.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_set_difference.h b/libcxx/include/__cxx03/__algorithm/ranges_set_difference.h index 0841fb4ffd0c0..764384b970833 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_set_difference.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_set_difference.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SET_DIFFERENCE_H #define _LIBCPP___ALGORITHM_RANGES_SET_DIFFERENCE_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/set_difference.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> 
-#include <__iterator/mergeable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/decay.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/set_difference.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/mergeable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_set_intersection.h b/libcxx/include/__cxx03/__algorithm/ranges_set_intersection.h index 9427379745b60..4aebb5e195da2 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_set_intersection.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_set_intersection.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SET_INTERSECTION_H #define _LIBCPP___ALGORITHM_RANGES_SET_INTERSECTION_H -#include <__algorithm/in_in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/set_intersection.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/mergeable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include 
<__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/set_intersection.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/mergeable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_set_symmetric_difference.h b/libcxx/include/__cxx03/__algorithm/ranges_set_symmetric_difference.h index 995eb0999d940..50ec574fcc301 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_set_symmetric_difference.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_set_symmetric_difference.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SET_SYMMETRIC_DIFFERENCE_H #define _LIBCPP___ALGORITHM_RANGES_SET_SYMMETRIC_DIFFERENCE_H -#include <__algorithm/in_in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/set_symmetric_difference.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/mergeable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include 
<__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/set_symmetric_difference.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/mergeable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_set_union.h b/libcxx/include/__cxx03/__algorithm/ranges_set_union.h index e870e390cc665..87d08e4cfedcc 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_set_union.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_set_union.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SET_UNION_H #define _LIBCPP___ALGORITHM_RANGES_SET_UNION_H -#include <__algorithm/in_in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/set_union.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/mergeable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/set_union.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include 
<__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/mergeable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_shuffle.h b/libcxx/include/__cxx03/__algorithm/ranges_shuffle.h index ab98ea22caabe..44e3a73e1b4b7 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_shuffle.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_shuffle.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SHUFFLE_H #define _LIBCPP___ALGORITHM_RANGES_SHUFFLE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/shuffle.h> -#include <__algorithm/uniform_random_bit_generator_adaptor.h> -#include <__config> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/permutable.h> -#include <__random/uniform_random_bit_generator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/remove_reference.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/shuffle.h> +#include <__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include 
<__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__random/uniform_random_bit_generator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_sort.h b/libcxx/include/__cxx03/__algorithm/ranges_sort.h index 0296c146b3ede..ad24cea344e8b 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_sort.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_sort.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SORT_H #define _LIBCPP___ALGORITHM_RANGES_SORT_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/sort.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/sort.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_sort_heap.h b/libcxx/include/__cxx03/__algorithm/ranges_sort_heap.h index bab30df1708c7..403917b032ddc 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_sort_heap.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_sort_heap.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SORT_HEAP_H #define _LIBCPP___ALGORITHM_RANGES_SORT_HEAP_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/sort_heap.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/sort_heap.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_stable_partition.h b/libcxx/include/__cxx03/__algorithm/ranges_stable_partition.h index f34027ff772c7..cd4ae07bf9c70 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_stable_partition.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_stable_partition.h @@ -9,33 +9,33 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_STABLE_PARTITION_H #define _LIBCPP___ALGORITHM_RANGES_STABLE_PARTITION_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/ranges_iterator_concept.h> -#include <__algorithm/stable_partition.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__ranges/subrange.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/ranges_iterator_concept.h> +#include <__cxx03/__algorithm/stable_partition.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> 
+#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_stable_sort.h b/libcxx/include/__cxx03/__algorithm/ranges_stable_sort.h index 93909e253cc0f..20c83283b9606 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_stable_sort.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_stable_sort.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_STABLE_SORT_H #define _LIBCPP___ALGORITHM_RANGES_STABLE_SORT_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/stable_sort.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/projected.h> -#include <__iterator/sortable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/stable_sort.h> +#include <__cxx03/__config> +#include 
<__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__iterator/sortable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_starts_with.h b/libcxx/include/__cxx03/__algorithm/ranges_starts_with.h index 17084e4f24336..0e2424a0d8a03 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_starts_with.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_starts_with.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_STARTS_WITH_H #define _LIBCPP___ALGORITHM_RANGES_STARTS_WITH_H -#include <__algorithm/in_in_result.h> -#include <__algorithm/ranges_mismatch.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/indirectly_comparable.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_result.h> +#include <__cxx03/__algorithm/ranges_mismatch.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_swap_ranges.h b/libcxx/include/__cxx03/__algorithm/ranges_swap_ranges.h index b6d9f618395a5..610e7c315486b 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_swap_ranges.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_swap_ranges.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_SWAP_RANGES_H #define _LIBCPP___ALGORITHM_RANGES_SWAP_RANGES_H -#include <__algorithm/in_in_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/swap_ranges.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_swap.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/swap_ranges.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_transform.h b/libcxx/include/__cxx03/__algorithm/ranges_transform.h index 7850ec4f84656..12e4a50154aae 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_transform.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_transform.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_TRANSFORM_H #define _LIBCPP___ALGORITHM_RANGES_TRANSFORM_H -#include <__algorithm/in_in_out_result.h> -#include <__algorithm/in_out_result.h> -#include 
<__concepts/constructible.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/in_in_out_result.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_unique.h b/libcxx/include/__cxx03/__algorithm/ranges_unique.h index 7a9b784321873..0893127dd9d17 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_unique.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_unique.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_UNIQUE_H #define _LIBCPP___ALGORITHM_RANGES_UNIQUE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/unique.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__ranges/subrange.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include 
<__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/unique.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_unique_copy.h b/libcxx/include/__cxx03/__algorithm/ranges_unique_copy.h index 61133885ae809..c30332a217f7e 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_unique_copy.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_unique_copy.h @@ -9,31 +9,31 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_UNIQUE_COPY_H #define _LIBCPP___ALGORITHM_RANGES_UNIQUE_COPY_H -#include <__algorithm/in_out_result.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/make_projected.h> -#include <__algorithm/unique_copy.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include 
<__utility/pair.h> +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/make_projected.h> +#include <__cxx03/__algorithm/unique_copy.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__algorithm/ranges_upper_bound.h b/libcxx/include/__cxx03/__algorithm/ranges_upper_bound.h index fa6fa7f70ed5a..79b49ab804b3b 100644 --- a/libcxx/include/__cxx03/__algorithm/ranges_upper_bound.h +++ b/libcxx/include/__cxx03/__algorithm/ranges_upper_bound.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_UPPER_BOUND_H #define _LIBCPP___ALGORITHM_RANGES_UPPER_BOUND_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include 
<__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/remove.h b/libcxx/include/__cxx03/__algorithm/remove.h index fd01c23cb6708..7849527585b17 100644 --- a/libcxx/include/__cxx03/__algorithm/remove.h +++ b/libcxx/include/__cxx03/__algorithm/remove.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ALGORITHM_REMOVE_H #define _LIBCPP___ALGORITHM_REMOVE_H -#include <__algorithm/find.h> -#include <__algorithm/find_if.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/find_if.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/remove_copy.h b/libcxx/include/__cxx03/__algorithm/remove_copy.h index 7be4c166ce3d7..e79c798127161 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_copy.h +++ b/libcxx/include/__cxx03/__algorithm/remove_copy.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REMOVE_COPY_H #define _LIBCPP___ALGORITHM_REMOVE_COPY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/remove_copy_if.h b/libcxx/include/__cxx03/__algorithm/remove_copy_if.h index dcafed169157d..7132e3c0bdb1a 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/remove_copy_if.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REMOVE_COPY_IF_H #define _LIBCPP___ALGORITHM_REMOVE_COPY_IF_H 
-#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/remove_if.h b/libcxx/include/__cxx03/__algorithm/remove_if.h index b14f3c0efa7e9..e6dc3d15fbc9c 100644 --- a/libcxx/include/__cxx03/__algorithm/remove_if.h +++ b/libcxx/include/__cxx03/__algorithm/remove_if.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___ALGORITHM_REMOVE_IF_H #define _LIBCPP___ALGORITHM_REMOVE_IF_H -#include <__algorithm/find_if.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__algorithm/find_if.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/replace.h b/libcxx/include/__cxx03/__algorithm/replace.h index 8057c78686e11..2b24752f048ef 100644 --- a/libcxx/include/__cxx03/__algorithm/replace.h +++ b/libcxx/include/__cxx03/__algorithm/replace.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REPLACE_H #define _LIBCPP___ALGORITHM_REPLACE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/replace_copy.h b/libcxx/include/__cxx03/__algorithm/replace_copy.h index 9a2258d9f58ed..6c50dd4e14f57 100644 --- a/libcxx/include/__cxx03/__algorithm/replace_copy.h +++ b/libcxx/include/__cxx03/__algorithm/replace_copy.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REPLACE_COPY_H #define _LIBCPP___ALGORITHM_REPLACE_COPY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/replace_copy_if.h b/libcxx/include/__cxx03/__algorithm/replace_copy_if.h index c2ed30f08d598..c714d50037339 100644 
--- a/libcxx/include/__cxx03/__algorithm/replace_copy_if.h +++ b/libcxx/include/__cxx03/__algorithm/replace_copy_if.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REPLACE_COPY_IF_H #define _LIBCPP___ALGORITHM_REPLACE_COPY_IF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/replace_if.h b/libcxx/include/__cxx03/__algorithm/replace_if.h index 78487e3deed70..8bd9a9c0db979 100644 --- a/libcxx/include/__cxx03/__algorithm/replace_if.h +++ b/libcxx/include/__cxx03/__algorithm/replace_if.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REPLACE_IF_H #define _LIBCPP___ALGORITHM_REPLACE_IF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/reverse.h b/libcxx/include/__cxx03/__algorithm/reverse.h index 4167c9116d96e..b97ec38490a22 100644 --- a/libcxx/include/__cxx03/__algorithm/reverse.h +++ b/libcxx/include/__cxx03/__algorithm/reverse.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___ALGORITHM_REVERSE_H #define _LIBCPP___ALGORITHM_REVERSE_H -#include <__algorithm/iter_swap.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iter_swap.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/reverse_copy.h b/libcxx/include/__cxx03/__algorithm/reverse_copy.h index 0fcecc3923268..29070fec8448c 100644 --- a/libcxx/include/__cxx03/__algorithm/reverse_copy.h +++ 
b/libcxx/include/__cxx03/__algorithm/reverse_copy.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_REVERSE_COPY_H #define _LIBCPP___ALGORITHM_REVERSE_COPY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/rotate.h b/libcxx/include/__cxx03/__algorithm/rotate.h index df4ca95aac95b..d6ca9230731ed 100644 --- a/libcxx/include/__cxx03/__algorithm/rotate.h +++ b/libcxx/include/__cxx03/__algorithm/rotate.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_ROTATE_H #define _LIBCPP___ALGORITHM_ROTATE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/move.h> -#include <__algorithm/move_backward.h> -#include <__algorithm/swap_ranges.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/move_backward.h> +#include <__cxx03/__algorithm/swap_ranges.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/rotate_copy.h b/libcxx/include/__cxx03/__algorithm/rotate_copy.h index cddcadd237d90..c20d9571e6ffd 100644 --- a/libcxx/include/__cxx03/__algorithm/rotate_copy.h +++ b/libcxx/include/__cxx03/__algorithm/rotate_copy.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ALGORITHM_ROTATE_COPY_H #define _LIBCPP___ALGORITHM_ROTATE_COPY_H -#include <__algorithm/copy.h> -#include <__config> +#include <__cxx03/__algorithm/copy.h> 
+#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/sample.h b/libcxx/include/__cxx03/__algorithm/sample.h index ebe5180b7eeca..e6743cf3828c0 100644 --- a/libcxx/include/__cxx03/__algorithm/sample.h +++ b/libcxx/include/__cxx03/__algorithm/sample.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_SAMPLE_H #define _LIBCPP___ALGORITHM_SAMPLE_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min.h> -#include <__assert> -#include <__config> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__random/uniform_int_distribution.h> -#include <__type_traits/common_type.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__random/uniform_int_distribution.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/search.h b/libcxx/include/__cxx03/__algorithm/search.h index b82ca78095354..eb862f4ad7ca9 100644 --- a/libcxx/include/__cxx03/__algorithm/search.h +++ b/libcxx/include/__cxx03/__algorithm/search.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___ALGORITHM_SEARCH_H #define _LIBCPP___ALGORITHM_SEARCH_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/enable_if.h> -#include 
<__type_traits/is_callable.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/search_n.h b/libcxx/include/__cxx03/__algorithm/search_n.h index 771647d3168a4..d6458b8ca90d8 100644 --- a/libcxx/include/__cxx03/__algorithm/search_n.h +++ b/libcxx/include/__cxx03/__algorithm/search_n.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___ALGORITHM_SEARCH_N_H #define _LIBCPP___ALGORITHM_SEARCH_N_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/concepts.h> -#include <__type_traits/is_callable.h> -#include <__utility/convert_to_integral.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/is_callable.h> +#include <__cxx03/__utility/convert_to_integral.h> +#include <__cxx03/__utility/pair.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/set_difference.h b/libcxx/include/__cxx03/__algorithm/set_difference.h index f414bcecb50df..13d6f0249e436 100644 --- a/libcxx/include/__cxx03/__algorithm/set_difference.h +++ b/libcxx/include/__cxx03/__algorithm/set_difference.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_SET_DIFFERENCE_H #define _LIBCPP___ALGORITHM_SET_DIFFERENCE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/copy.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/set_intersection.h b/libcxx/include/__cxx03/__algorithm/set_intersection.h index bb0d86cd0f58d..f1193ec349cfd 100644 --- a/libcxx/include/__cxx03/__algorithm/set_intersection.h +++ b/libcxx/include/__cxx03/__algorithm/set_intersection.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_SET_INTERSECTION_H #define _LIBCPP___ALGORITHM_SET_INTERSECTION_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include 
<__algorithm/iterator_operations.h> -#include <__algorithm/lower_bound.h> -#include <__config> -#include <__functional/identity.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__type_traits/is_same.h> -#include <__utility/exchange.h> -#include <__utility/move.h> -#include <__utility/swap.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/exchange.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h b/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h index db36665a61365..71aca2b8ed03b 100644 --- a/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h +++ b/libcxx/include/__cxx03/__algorithm/set_symmetric_difference.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_SET_SYMMETRIC_DIFFERENCE_H #define _LIBCPP___ALGORITHM_SET_SYMMETRIC_DIFFERENCE_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/copy.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> 
+#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/set_union.h b/libcxx/include/__cxx03/__algorithm/set_union.h index a79c50fd3cf2f..f60221cce3cd8 100644 --- a/libcxx/include/__cxx03/__algorithm/set_union.h +++ b/libcxx/include/__cxx03/__algorithm/set_union.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_SET_UNION_H #define _LIBCPP___ALGORITHM_SET_UNION_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/copy.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/shift_left.h b/libcxx/include/__cxx03/__algorithm/shift_left.h index 06cd7c5f87644..b59a069826710 100644 --- a/libcxx/include/__cxx03/__algorithm/shift_left.h +++ b/libcxx/include/__cxx03/__algorithm/shift_left.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___ALGORITHM_SHIFT_LEFT_H #define _LIBCPP___ALGORITHM_SHIFT_LEFT_H -#include <__algorithm/move.h> -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/shift_right.h b/libcxx/include/__cxx03/__algorithm/shift_right.h index 01853057fc478..51d8ea613245d 100644 --- a/libcxx/include/__cxx03/__algorithm/shift_right.h +++ b/libcxx/include/__cxx03/__algorithm/shift_right.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___ALGORITHM_SHIFT_RIGHT_H #define _LIBCPP___ALGORITHM_SHIFT_RIGHT_H -#include <__algorithm/move.h> -#include <__algorithm/move_backward.h> -#include <__algorithm/swap_ranges.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/swap.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/move_backward.h> +#include <__cxx03/__algorithm/swap_ranges.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/shuffle.h b/libcxx/include/__cxx03/__algorithm/shuffle.h index c9c56ce8c2c0b..30b372ffe767a 100644 --- a/libcxx/include/__cxx03/__algorithm/shuffle.h +++ b/libcxx/include/__cxx03/__algorithm/shuffle.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_SHUFFLE_H #define _LIBCPP___ALGORITHM_SHUFFLE_H -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__random/uniform_int_distribution.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include -#include +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__random/uniform_int_distribution.h> 
+#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/sift_down.h b/libcxx/include/__cxx03/__algorithm/sift_down.h index 42803e30631fb..85f0aa92c568d 100644 --- a/libcxx/include/__cxx03/__algorithm/sift_down.h +++ b/libcxx/include/__cxx03/__algorithm/sift_down.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___ALGORITHM_SIFT_DOWN_H #define _LIBCPP___ALGORITHM_SIFT_DOWN_H -#include <__algorithm/iterator_operations.h> -#include <__assert> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/simd_utils.h b/libcxx/include/__cxx03/__algorithm/simd_utils.h index 549197be80183..4769a8176a885 100644 --- a/libcxx/include/__cxx03/__algorithm/simd_utils.h +++ b/libcxx/include/__cxx03/__algorithm/simd_utils.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H #define _LIBCPP___ALGORITHM_SIMD_UTILS_H -#include <__algorithm/min.h> -#include <__bit/bit_cast.h> -#include <__bit/countl.h> -#include <__bit/countr.h> -#include <__config> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__utility/integer_sequence.h> -#include -#include +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__bit/bit_cast.h> +#include 
<__cxx03/__bit/countl.h> +#include <__cxx03/__bit/countr.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> // TODO: Find out how altivec changes things and allow vectorizations there too. #if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_CLANG_VER) && !defined(__ALTIVEC__) diff --git a/libcxx/include/__cxx03/__algorithm/sort.h b/libcxx/include/__cxx03/__algorithm/sort.h index 07b5814639e9e..d14ec87b4aea8 100644 --- a/libcxx/include/__cxx03/__algorithm/sort.h +++ b/libcxx/include/__cxx03/__algorithm/sort.h @@ -9,38 +9,38 @@ #ifndef _LIBCPP___ALGORITHM_SORT_H #define _LIBCPP___ALGORITHM_SORT_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iter_swap.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/min_element.h> -#include <__algorithm/partial_sort.h> -#include <__algorithm/unwrap_iter.h> -#include <__assert> -#include <__bit/blsr.h> -#include <__bit/countl.h> -#include <__bit/countr.h> -#include <__config> -#include <__debug_utils/randomize_range.h> -#include <__debug_utils/strict_weak_ordering_check.h> -#include <__functional/operations.h> -#include <__functional/ranges_operations.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/conditional.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iter_swap.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include 
<__cxx03/__algorithm/min_element.h> +#include <__cxx03/__algorithm/partial_sort.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__assert> +#include <__cxx03/__bit/blsr.h> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__bit/countr.h> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/randomize_range.h> +#include <__cxx03/__debug_utils/strict_weak_ordering_check.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/climits> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/sort_heap.h b/libcxx/include/__cxx03/__algorithm/sort_heap.h index f20b110c7fd12..c2d218b8f5e0d 100644 --- a/libcxx/include/__cxx03/__algorithm/sort_heap.h +++ b/libcxx/include/__cxx03/__algorithm/sort_heap.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ALGORITHM_SORT_HEAP_H #define _LIBCPP___ALGORITHM_SORT_HEAP_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/pop_heap.h> -#include <__config> -#include <__debug_utils/strict_weak_ordering_check.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include 
<__cxx03/__algorithm/pop_heap.h> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/strict_weak_ordering_check.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/stable_partition.h b/libcxx/include/__cxx03/__algorithm/stable_partition.h index 8bb1eaf2d2249..cea18b30fbb37 100644 --- a/libcxx/include/__cxx03/__algorithm/stable_partition.h +++ b/libcxx/include/__cxx03/__algorithm/stable_partition.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___ALGORITHM_STABLE_PARTITION_H #define _LIBCPP___ALGORITHM_STABLE_PARTITION_H -#include <__algorithm/iterator_operations.h> -#include <__algorithm/rotate.h> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__memory/destruct_n.h> -#include <__memory/temporary_buffer.h> -#include <__memory/unique_ptr.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/rotate.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/destruct_n.h> +#include <__cxx03/__memory/temporary_buffer.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__algorithm/stable_sort.h b/libcxx/include/__cxx03/__algorithm/stable_sort.h index 726e7e16b3564..6a3cef7ec1b66 100644 --- a/libcxx/include/__cxx03/__algorithm/stable_sort.h +++ b/libcxx/include/__cxx03/__algorithm/stable_sort.h @@ -9,28 +9,28 @@ #ifndef _LIBCPP___ALGORITHM_STABLE_SORT_H #define _LIBCPP___ALGORITHM_STABLE_SORT_H -#include <__algorithm/comp.h> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/inplace_merge.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/sort.h> -#include <__config> -#include <__debug_utils/strict_weak_ordering_check.h> -#include <__iterator/iterator_traits.h> -#include <__memory/destruct_n.h> -#include <__memory/temporary_buffer.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/inplace_merge.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/sort.h> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/strict_weak_ordering_check.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/destruct_n.h> +#include <__cxx03/__memory/temporary_buffer.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/swap_ranges.h b/libcxx/include/__cxx03/__algorithm/swap_ranges.h index 54b453b72360e..12bc8056e3577 100644 --- a/libcxx/include/__cxx03/__algorithm/swap_ranges.h +++ 
b/libcxx/include/__cxx03/__algorithm/swap_ranges.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ALGORITHM_SWAP_RANGES_H #define _LIBCPP___ALGORITHM_SWAP_RANGES_H -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/three_way_comp_ref_type.h b/libcxx/include/__cxx03/__algorithm/three_way_comp_ref_type.h index 5702a1fee0826..be6a4e4d01482 100644 --- a/libcxx/include/__cxx03/__algorithm/three_way_comp_ref_type.h +++ b/libcxx/include/__cxx03/__algorithm/three_way_comp_ref_type.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_THREE_WAY_COMP_REF_TYPE_H #define _LIBCPP___ALGORITHM_THREE_WAY_COMP_REF_TYPE_H -#include <__assert> -#include <__compare/ordering.h> -#include <__config> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__assert> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__algorithm/transform.h b/libcxx/include/__cxx03/__algorithm/transform.h index 1b424409591ce..1608932b050b4 100644 --- a/libcxx/include/__cxx03/__algorithm/transform.h +++ b/libcxx/include/__cxx03/__algorithm/transform.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ALGORITHM_TRANSFORM_H #define _LIBCPP___ALGORITHM_TRANSFORM_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h b/libcxx/include/__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h index aef0fbfb7c284..25a9e35fd7aa7 100644 --- a/libcxx/include/__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h +++ b/libcxx/include/__cxx03/__algorithm/uniform_random_bit_generator_adaptor.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_UNIFORM_RANDOM_BIT_GENERATOR_ADAPTOR_H #define _LIBCPP___ALGORITHM_RANGES_UNIFORM_RANDOM_BIT_GENERATOR_ADAPTOR_H -#include <__config> -#include <__functional/invoke.h> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -20,7 +20,7 @@ #if _LIBCPP_STD_VER >= 20 _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/unique.h b/libcxx/include/__cxx03/__algorithm/unique.h index d597014596f2e..8d21c63eb2b7d 100644 --- a/libcxx/include/__cxx03/__algorithm/unique.h +++ b/libcxx/include/__cxx03/__algorithm/unique.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___ALGORITHM_UNIQUE_H #define _LIBCPP___ALGORITHM_UNIQUE_H -#include <__algorithm/adjacent_find.h> -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/adjacent_find.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/unique_copy.h b/libcxx/include/__cxx03/__algorithm/unique_copy.h index 16ce80cab32f0..cfa95a58f7b09 100644 --- a/libcxx/include/__cxx03/__algorithm/unique_copy.h +++ b/libcxx/include/__cxx03/__algorithm/unique_copy.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ALGORITHM_UNIQUE_COPY_H #define _LIBCPP___ALGORITHM_UNIQUE_COPY_H -#include <__algorithm/comp.h> -#include <__algorithm/iterator_operations.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_base_of.h> -#include <__type_traits/is_same.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/unwrap_iter.h b/libcxx/include/__cxx03/__algorithm/unwrap_iter.h index 8cc0d22d4fc21..b3259af17dd03 100644 --- a/libcxx/include/__cxx03/__algorithm/unwrap_iter.h +++ b/libcxx/include/__cxx03/__algorithm/unwrap_iter.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___ALGORITHM_UNWRAP_ITER_H #define _LIBCPP___ALGORITHM_UNWRAP_ITER_H -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constructible.h> -#include <__utility/declval.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include 
<__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/unwrap_range.h b/libcxx/include/__cxx03/__algorithm/unwrap_range.h index 2d4b9bb5545ad..26045ef7075af 100644 --- a/libcxx/include/__cxx03/__algorithm/unwrap_range.h +++ b/libcxx/include/__cxx03/__algorithm/unwrap_range.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ALGORITHM_UNWRAP_RANGE_H #define _LIBCPP___ALGORITHM_UNWRAP_RANGE_H -#include <__algorithm/unwrap_iter.h> -#include <__concepts/constructible.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/next.h> -#include <__utility/declval.h> -#include <__utility/move.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__algorithm/upper_bound.h b/libcxx/include/__cxx03/__algorithm/upper_bound.h index c39dec2e89698..069dd6d253e28 100644 --- a/libcxx/include/__cxx03/__algorithm/upper_bound.h +++ b/libcxx/include/__cxx03/__algorithm/upper_bound.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___ALGORITHM_UPPER_BOUND_H #define _LIBCPP___ALGORITHM_UPPER_BOUND_H -#include <__algorithm/comp.h> -#include <__algorithm/half_positive.h> -#include 
<__algorithm/iterator_operations.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/invoke.h> -#include <__iterator/advance.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_constructible.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/half_positive.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__assert b/libcxx/include/__cxx03/__assert index 49769fb4d4497..19a322934e41d 100644 --- a/libcxx/include/__cxx03/__assert +++ b/libcxx/include/__cxx03/__assert @@ -11,7 +11,7 @@ #define _LIBCPP___ASSERT #include <__assertion_handler> // Note: this include is generated by CMake and is potentially vendor-provided. 
-#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/aliases.h b/libcxx/include/__cxx03/__atomic/aliases.h index e27e09af6b77d..b5a7685eb4fa8 100644 --- a/libcxx/include/__cxx03/__atomic/aliases.h +++ b/libcxx/include/__cxx03/__atomic/aliases.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___ATOMIC_ALIASES_H #define _LIBCPP___ATOMIC_ALIASES_H -#include <__atomic/atomic.h> -#include <__atomic/atomic_lock_free.h> -#include <__atomic/contention_t.h> -#include <__atomic/is_always_lock_free.h> -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/make_unsigned.h> -#include -#include +#include <__cxx03/__atomic/atomic.h> +#include <__cxx03/__atomic/atomic_lock_free.h> +#include <__cxx03/__atomic/contention_t.h> +#include <__cxx03/__atomic/is_always_lock_free.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic.h b/libcxx/include/__cxx03/__atomic/atomic.h index bd3f659c22df0..0e5936e2eb542 100644 --- a/libcxx/include/__cxx03/__atomic/atomic.h +++ b/libcxx/include/__cxx03/__atomic/atomic.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_H #define _LIBCPP___ATOMIC_ATOMIC_H -#include <__atomic/atomic_base.h> -#include <__atomic/check_memory_order.h> -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/memory_order.h> -#include <__config> -#include <__functional/operations.h> -#include <__memory/addressof.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/remove_volatile.h> -#include 
<__utility/forward.h> -#include -#include +#include <__cxx03/__atomic/atomic_base.h> +#include <__cxx03/__atomic/check_memory_order.h> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/remove_volatile.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstring> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic_base.h b/libcxx/include/__cxx03/__atomic/atomic_base.h index 7e26434c9c3a0..ae6aaf4f8284f 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_base.h +++ b/libcxx/include/__cxx03/__atomic/atomic_base.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_BASE_H #define _LIBCPP___ATOMIC_ATOMIC_BASE_H -#include <__atomic/atomic_sync.h> -#include <__atomic/check_memory_order.h> -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/is_always_lock_free.h> -#include <__atomic/memory_order.h> -#include <__config> -#include <__memory/addressof.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_same.h> -#include +#include <__cxx03/__atomic/atomic_sync.h> +#include <__cxx03/__atomic/check_memory_order.h> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/is_always_lock_free.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include 
<__cxx03/__type_traits/is_same.h> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic_flag.h b/libcxx/include/__cxx03/__atomic/atomic_flag.h index 00b157cdff78b..6a8471e1f1ec8 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_flag.h +++ b/libcxx/include/__cxx03/__atomic/atomic_flag.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_FLAG_H #define _LIBCPP___ATOMIC_ATOMIC_FLAG_H -#include <__atomic/atomic_sync.h> -#include <__atomic/contention_t.h> -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/memory_order.h> -#include <__chrono/duration.h> -#include <__config> -#include <__memory/addressof.h> -#include <__thread/support.h> -#include +#include <__cxx03/__atomic/atomic_sync.h> +#include <__cxx03/__atomic/contention_t.h> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic_init.h b/libcxx/include/__cxx03/__atomic/atomic_init.h index 8e86ba31b4ac3..666afb93268d6 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_init.h +++ b/libcxx/include/__cxx03/__atomic/atomic_init.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_INIT_H #define _LIBCPP___ATOMIC_ATOMIC_INIT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic_lock_free.h b/libcxx/include/__cxx03/__atomic/atomic_lock_free.h index 0715439db4503..17dfcfd46106e 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_lock_free.h +++ b/libcxx/include/__cxx03/__atomic/atomic_lock_free.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_LOCK_FREE_H 
#define _LIBCPP___ATOMIC_ATOMIC_LOCK_FREE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/atomic_ref.h b/libcxx/include/__cxx03/__atomic/atomic_ref.h index b0180a37ab500..c19e6a2e5d26d 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_ref.h +++ b/libcxx/include/__cxx03/__atomic/atomic_ref.h @@ -17,26 +17,26 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_REF_H #define _LIBCPP___ATOMIC_ATOMIC_REF_H -#include <__assert> -#include <__atomic/atomic_sync.h> -#include <__atomic/check_memory_order.h> -#include <__atomic/to_gcc_order.h> -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__memory/addressof.h> -#include <__type_traits/has_unique_object_representation.h> -#include <__type_traits/is_trivially_copyable.h> -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__atomic/atomic_sync.h> +#include <__cxx03/__atomic/check_memory_order.h> +#include <__cxx03/__atomic/to_gcc_order.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/has_unique_object_representation.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/cstring> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__atomic/atomic_sync.h b/libcxx/include/__cxx03/__atomic/atomic_sync.h index aaf81f58731a9..815c8a1459649 100644 --- a/libcxx/include/__cxx03/__atomic/atomic_sync.h +++ b/libcxx/include/__cxx03/__atomic/atomic_sync.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___ATOMIC_ATOMIC_SYNC_H #define _LIBCPP___ATOMIC_ATOMIC_SYNC_H -#include 
<__atomic/contention_t.h> -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/memory_order.h> -#include <__atomic/to_gcc_order.h> -#include <__chrono/duration.h> -#include <__config> -#include <__memory/addressof.h> -#include <__thread/poll_with_backoff.h> -#include <__thread/support.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/decay.h> -#include <__type_traits/invoke.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__atomic/contention_t.h> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__atomic/to_gcc_order.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__thread/poll_with_backoff.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstring> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/check_memory_order.h b/libcxx/include/__cxx03/__atomic/check_memory_order.h index 536f764a61902..11033ef2c3b49 100644 --- a/libcxx/include/__cxx03/__atomic/check_memory_order.h +++ b/libcxx/include/__cxx03/__atomic/check_memory_order.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ATOMIC_CHECK_MEMORY_ORDER_H #define _LIBCPP___ATOMIC_CHECK_MEMORY_ORDER_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/contention_t.h b/libcxx/include/__cxx03/__atomic/contention_t.h index 65890f338ce99..1f069b01b28e7 100644 --- a/libcxx/include/__cxx03/__atomic/contention_t.h +++ b/libcxx/include/__cxx03/__atomic/contention_t.h @@ -9,9 +9,9 @@ #ifndef 
_LIBCPP___ATOMIC_CONTENTION_T_H #define _LIBCPP___ATOMIC_CONTENTION_T_H -#include <__atomic/cxx_atomic_impl.h> -#include <__config> -#include +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__config> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h b/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h index 18e88aa97bec7..f06627f1a8f66 100644 --- a/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h +++ b/libcxx/include/__cxx03/__atomic/cxx_atomic_impl.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H #define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H -#include <__atomic/memory_order.h> -#include <__atomic/to_gcc_order.h> -#include <__config> -#include <__memory/addressof.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__type_traits/remove_const.h> -#include +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__atomic/to_gcc_order.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/fence.h b/libcxx/include/__cxx03/__atomic/fence.h index 8c27ea54d62dd..5200cd533bdeb 100644 --- a/libcxx/include/__cxx03/__atomic/fence.h +++ b/libcxx/include/__cxx03/__atomic/fence.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ATOMIC_FENCE_H #define _LIBCPP___ATOMIC_FENCE_H -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/memory_order.h> -#include <__config> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__atomic/is_always_lock_free.h b/libcxx/include/__cxx03/__atomic/is_always_lock_free.h index f928e79f70cea..29c42d9340fdb 100644 --- a/libcxx/include/__cxx03/__atomic/is_always_lock_free.h +++ b/libcxx/include/__cxx03/__atomic/is_always_lock_free.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ATOMIC_IS_ALWAYS_LOCK_FREE_H #define _LIBCPP___ATOMIC_IS_ALWAYS_LOCK_FREE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/kill_dependency.h b/libcxx/include/__cxx03/__atomic/kill_dependency.h index 103d52d35787f..3deb29f0dbda1 100644 --- a/libcxx/include/__cxx03/__atomic/kill_dependency.h +++ b/libcxx/include/__cxx03/__atomic/kill_dependency.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___ATOMIC_KILL_DEPENDENCY_H #define _LIBCPP___ATOMIC_KILL_DEPENDENCY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/memory_order.h b/libcxx/include/__cxx03/__atomic/memory_order.h index 294121d1c4e7f..8a7564dc1553e 100644 --- a/libcxx/include/__cxx03/__atomic/memory_order.h +++ b/libcxx/include/__cxx03/__atomic/memory_order.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ATOMIC_MEMORY_ORDER_H #define _LIBCPP___ATOMIC_MEMORY_ORDER_H -#include <__config> -#include <__type_traits/is_same.h> -#include <__type_traits/underlying_type.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/underlying_type.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__atomic/to_gcc_order.h b/libcxx/include/__cxx03/__atomic/to_gcc_order.h index d04c111addd31..41ada88281732 100644 --- a/libcxx/include/__cxx03/__atomic/to_gcc_order.h +++ b/libcxx/include/__cxx03/__atomic/to_gcc_order.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___ATOMIC_TO_GCC_ORDER_H #define 
_LIBCPP___ATOMIC_TO_GCC_ORDER_H -#include <__atomic/memory_order.h> -#include <__config> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/bit_cast.h b/libcxx/include/__cxx03/__bit/bit_cast.h index cd04567381793..9f88805e125dc 100644 --- a/libcxx/include/__cxx03/__bit/bit_cast.h +++ b/libcxx/include/__cxx03/__bit/bit_cast.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___BIT_BIT_CAST_H #define _LIBCPP___BIT_BIT_CAST_H -#include <__config> -#include <__type_traits/is_trivially_copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_trivially_copyable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/bit_ceil.h b/libcxx/include/__cxx03/__bit/bit_ceil.h index cfd792dc2e2ad..b5cb5cc08f422 100644 --- a/libcxx/include/__cxx03/__bit/bit_ceil.h +++ b/libcxx/include/__cxx03/__bit/bit_ceil.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___BIT_BIT_CEIL_H #define _LIBCPP___BIT_BIT_CEIL_H -#include <__assert> -#include <__bit/countl.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__assert> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/bit_floor.h b/libcxx/include/__cxx03/__bit/bit_floor.h index 133e369504e43..93379be5e2162 100644 --- a/libcxx/include/__cxx03/__bit/bit_floor.h +++ b/libcxx/include/__cxx03/__bit/bit_floor.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___BIT_BIT_FLOOR_H #define _LIBCPP___BIT_BIT_FLOOR_H -#include <__bit/bit_log2.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__bit/bit_log2.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include 
<__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/bit_log2.h b/libcxx/include/__cxx03/__bit/bit_log2.h index 62936f6786860..16a7fb1ec3364 100644 --- a/libcxx/include/__cxx03/__bit/bit_log2.h +++ b/libcxx/include/__cxx03/__bit/bit_log2.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___BIT_BIT_LOG2_H #define _LIBCPP___BIT_BIT_LOG2_H -#include <__bit/countl.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/bit_width.h b/libcxx/include/__cxx03/__bit/bit_width.h index 853e481776f7d..4832359f75326 100644 --- a/libcxx/include/__cxx03/__bit/bit_width.h +++ b/libcxx/include/__cxx03/__bit/bit_width.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___BIT_BIT_WIDTH_H #define _LIBCPP___BIT_BIT_WIDTH_H -#include <__bit/bit_log2.h> -#include <__concepts/arithmetic.h> -#include <__config> +#include <__cxx03/__bit/bit_log2.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/blsr.h b/libcxx/include/__cxx03/__bit/blsr.h index 76bd521f5c307..5375bdf9208d7 100644 --- a/libcxx/include/__cxx03/__bit/blsr.h +++ b/libcxx/include/__cxx03/__bit/blsr.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___BIT_BLSR_H #define _LIBCPP___BIT_BLSR_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/byteswap.h b/libcxx/include/__cxx03/__bit/byteswap.h index 6225ecf2f92df..6e65600512284 100644 --- a/libcxx/include/__cxx03/__bit/byteswap.h +++ b/libcxx/include/__cxx03/__bit/byteswap.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___BIT_BYTESWAP_H 
#define _LIBCPP___BIT_BYTESWAP_H -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/countl.h b/libcxx/include/__cxx03/__bit/countl.h index 998a0b44c19dc..9a4a269a7da2d 100644 --- a/libcxx/include/__cxx03/__bit/countl.h +++ b/libcxx/include/__cxx03/__bit/countl.h @@ -12,18 +12,18 @@ #ifndef _LIBCPP___BIT_COUNTL_H #define _LIBCPP___BIT_COUNTL_H -#include <__bit/rotate.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include <__type_traits/is_unsigned_integer.h> -#include +#include <__cxx03/__bit/rotate.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_unsigned_integer.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__bit/countr.h b/libcxx/include/__cxx03/__bit/countr.h index 9e92021fba355..34525f591048b 100644 --- a/libcxx/include/__cxx03/__bit/countr.h +++ b/libcxx/include/__cxx03/__bit/countr.h @@ -12,17 +12,17 @@ #ifndef _LIBCPP___BIT_COUNTR_H #define _LIBCPP___BIT_COUNTR_H -#include <__bit/rotate.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__bit/rotate.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__bit/endian.h b/libcxx/include/__cxx03/__bit/endian.h index 2d31e5ddff4f1..ba0a5ac4c2d08 100644 --- 
a/libcxx/include/__cxx03/__bit/endian.h +++ b/libcxx/include/__cxx03/__bit/endian.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___BIT_ENDIAN_H #define _LIBCPP___BIT_ENDIAN_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/has_single_bit.h b/libcxx/include/__cxx03/__bit/has_single_bit.h index 52f5853a1bc8a..8f69d343a6087 100644 --- a/libcxx/include/__cxx03/__bit/has_single_bit.h +++ b/libcxx/include/__cxx03/__bit/has_single_bit.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___BIT_HAS_SINGLE_BIT_H #define _LIBCPP___BIT_HAS_SINGLE_BIT_H -#include <__concepts/arithmetic.h> -#include <__config> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__bit/invert_if.h b/libcxx/include/__cxx03/__bit/invert_if.h index f7606ede26da0..0b729bc79c192 100644 --- a/libcxx/include/__cxx03/__bit/invert_if.h +++ b/libcxx/include/__cxx03/__bit/invert_if.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___BIT_INVERT_IF_H #define _LIBCPP___BIT_INVERT_IF_H -#include <__concepts/arithmetic.h> -#include <__config> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit/popcount.h b/libcxx/include/__cxx03/__bit/popcount.h index 5cf0a01d07338..70a12ea260741 100644 --- a/libcxx/include/__cxx03/__bit/popcount.h +++ b/libcxx/include/__cxx03/__bit/popcount.h @@ -12,17 +12,17 @@ #ifndef _LIBCPP___BIT_POPCOUNT_H #define _LIBCPP___BIT_POPCOUNT_H -#include <__bit/rotate.h> -#include <__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__bit/rotate.h> +#include <__cxx03/__concepts/arithmetic.h> +#include 
<__cxx03/__config> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__bit/rotate.h b/libcxx/include/__cxx03/__bit/rotate.h index 90e430e9d0425..02b4ac66af109 100644 --- a/libcxx/include/__cxx03/__bit/rotate.h +++ b/libcxx/include/__cxx03/__bit/rotate.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___BIT_ROTATE_H #define _LIBCPP___BIT_ROTATE_H -#include <__concepts/arithmetic.h> -#include <__config> -#include <__type_traits/is_unsigned_integer.h> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_unsigned_integer.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__bit_reference b/libcxx/include/__cxx03/__bit_reference index 22637d4397412..7339f5bd8c9e1 100644 --- a/libcxx/include/__cxx03/__bit_reference +++ b/libcxx/include/__cxx03/__bit_reference @@ -10,28 +10,28 @@ #ifndef _LIBCPP___BIT_REFERENCE #define _LIBCPP___BIT_REFERENCE -#include <__algorithm/copy_n.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/min.h> -#include <__bit/countr.h> -#include <__bit/invert_if.h> -#include <__bit/popcount.h> -#include <__compare/ordering.h> -#include <__config> -#include <__fwd/bit_reference.h> -#include <__iterator/iterator_traits.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/conditional.h> -#include <__utility/swap.h> -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__bit/countr.h> +#include <__cxx03/__bit/invert_if.h> +#include <__cxx03/__bit/popcount.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include 
<__cxx03/__fwd/bit_reference.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstring> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__charconv/chars_format.h b/libcxx/include/__cxx03/__charconv/chars_format.h index c76cebd5d1847..c206289c0c258 100644 --- a/libcxx/include/__cxx03/__charconv/chars_format.h +++ b/libcxx/include/__cxx03/__charconv/chars_format.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___CHARCONV_CHARS_FORMAT_H #define _LIBCPP___CHARCONV_CHARS_FORMAT_H -#include <__config> -#include <__utility/to_underlying.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/to_underlying.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/from_chars_integral.h b/libcxx/include/__cxx03/__charconv/from_chars_integral.h index c1f033b37b913..a3d6e5537280d 100644 --- a/libcxx/include/__cxx03/__charconv/from_chars_integral.h +++ b/libcxx/include/__cxx03/__charconv/from_chars_integral.h @@ -10,26 +10,26 @@ #ifndef _LIBCPP___CHARCONV_FROM_CHARS_INTEGRAL_H #define _LIBCPP___CHARCONV_FROM_CHARS_INTEGRAL_H -#include <__algorithm/copy_n.h> -#include <__assert> -#include <__charconv/from_chars_result.h> -#include <__charconv/traits.h> -#include <__config> -#include <__memory/addressof.h> -#include <__system_error/errc.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/make_unsigned.h> -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__assert> +#include 
<__cxx03/__charconv/from_chars_result.h> +#include <__cxx03/__charconv/traits.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -155,9 +155,9 @@ __from_chars_atoi(const char* __first, const char* __last, _Tp& __value) { /* // Code used to generate __from_chars_log2f_lut. -#include -#include -#include +#include <__cxx03/cmath> +#include <__cxx03/format> +#include <__cxx03/iostream> int main() { for (int i = 2; i <= 36; ++i) diff --git a/libcxx/include/__cxx03/__charconv/from_chars_result.h b/libcxx/include/__cxx03/__charconv/from_chars_result.h index a7bfd6530a8a0..8bba738969e34 100644 --- a/libcxx/include/__cxx03/__charconv/from_chars_result.h +++ b/libcxx/include/__cxx03/__charconv/from_chars_result.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___CHARCONV_FROM_CHARS_RESULT_H #define _LIBCPP___CHARCONV_FROM_CHARS_RESULT_H -#include <__config> -#include <__system_error/errc.h> +#include <__cxx03/__config> +#include <__cxx03/__system_error/errc.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/tables.h b/libcxx/include/__cxx03/__charconv/tables.h index 6b93536b8c1ba..110352a4be0c7 100644 --- a/libcxx/include/__cxx03/__charconv/tables.h +++ b/libcxx/include/__cxx03/__charconv/tables.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___CHARCONV_TABLES #define _LIBCPP___CHARCONV_TABLES -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstdint> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/to_chars.h b/libcxx/include/__cxx03/__charconv/to_chars.h index 8ef09af737559..2db35ace09439 100644 --- a/libcxx/include/__cxx03/__charconv/to_chars.h +++ b/libcxx/include/__cxx03/__charconv/to_chars.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHARCONV_TO_CHARS #define _LIBCPP___CHARCONV_TO_CHARS -#include <__charconv/to_chars_floating_point.h> -#include <__charconv/to_chars_integral.h> -#include <__config> +#include <__cxx03/__charconv/to_chars_floating_point.h> +#include <__cxx03/__charconv/to_chars_integral.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/to_chars_base_10.h b/libcxx/include/__cxx03/__charconv/to_chars_base_10.h index c49f4f6797aa4..33baaf63caeef 100644 --- a/libcxx/include/__cxx03/__charconv/to_chars_base_10.h +++ b/libcxx/include/__cxx03/__charconv/to_chars_base_10.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___CHARCONV_TO_CHARS_BASE_10_H #define _LIBCPP___CHARCONV_TO_CHARS_BASE_10_H -#include <__algorithm/copy_n.h> -#include <__assert> -#include <__charconv/tables.h> -#include <__config> -#include -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__assert> +#include <__cxx03/__charconv/tables.h> +#include <__cxx03/__config> +#include <__cxx03/cstdint> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__charconv/to_chars_floating_point.h b/libcxx/include/__cxx03/__charconv/to_chars_floating_point.h index 118f316b21a10..305b7e71a6b7e 100644 --- a/libcxx/include/__cxx03/__charconv/to_chars_floating_point.h +++ b/libcxx/include/__cxx03/__charconv/to_chars_floating_point.h @@ -10,9 +10,9 @@ #ifndef 
_LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H #define _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H -#include <__charconv/chars_format.h> -#include <__charconv/to_chars_result.h> -#include <__config> +#include <__cxx03/__charconv/chars_format.h> +#include <__cxx03/__charconv/to_chars_result.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/to_chars_integral.h b/libcxx/include/__cxx03/__charconv/to_chars_integral.h index 0369f4dfb9bda..0b3f319e75125 100644 --- a/libcxx/include/__cxx03/__charconv/to_chars_integral.h +++ b/libcxx/include/__cxx03/__charconv/to_chars_integral.h @@ -10,31 +10,31 @@ #ifndef _LIBCPP___CHARCONV_TO_CHARS_INTEGRAL_H #define _LIBCPP___CHARCONV_TO_CHARS_INTEGRAL_H -#include <__algorithm/copy_n.h> -#include <__assert> -#include <__bit/countl.h> -#include <__charconv/tables.h> -#include <__charconv/to_chars_base_10.h> -#include <__charconv/to_chars_result.h> -#include <__charconv/traits.h> -#include <__config> -#include <__system_error/errc.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/make_32_64_or_128_bit.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/unreachable.h> -#include -#include -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__assert> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__charconv/tables.h> +#include <__cxx03/__charconv/to_chars_base_10.h> +#include <__cxx03/__charconv/to_chars_result.h> +#include <__cxx03/__charconv/traits.h> +#include <__cxx03/__config> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/make_32_64_or_128_bit.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include 
<__cxx03/__utility/unreachable.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__charconv/to_chars_result.h b/libcxx/include/__cxx03/__charconv/to_chars_result.h index 8df0897a49fbb..72fef89dcc5f5 100644 --- a/libcxx/include/__cxx03/__charconv/to_chars_result.h +++ b/libcxx/include/__cxx03/__charconv/to_chars_result.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___CHARCONV_TO_CHARS_RESULT_H #define _LIBCPP___CHARCONV_TO_CHARS_RESULT_H -#include <__config> -#include <__system_error/errc.h> +#include <__cxx03/__config> +#include <__cxx03/__system_error/errc.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__charconv/traits.h b/libcxx/include/__cxx03/__charconv/traits.h index c91c6da324797..5eafa6caa39e5 100644 --- a/libcxx/include/__cxx03/__charconv/traits.h +++ b/libcxx/include/__cxx03/__charconv/traits.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___CHARCONV_TRAITS #define _LIBCPP___CHARCONV_TRAITS -#include <__assert> -#include <__bit/countl.h> -#include <__charconv/tables.h> -#include <__charconv/to_chars_base_10.h> -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_unsigned.h> -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__charconv/tables.h> +#include <__cxx03/__charconv/to_chars_base_10.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/cstdint> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff 
--git a/libcxx/include/__cxx03/__chrono/calendar.h b/libcxx/include/__cxx03/__chrono/calendar.h index bb1c5e7ebc8d0..5d8e396851ae8 100644 --- a/libcxx/include/__cxx03/__chrono/calendar.h +++ b/libcxx/include/__cxx03/__chrono/calendar.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_CALENDAR_H #define _LIBCPP___CHRONO_CALENDAR_H -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__config> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/concepts.h b/libcxx/include/__cxx03/__chrono/concepts.h index 61ec256b23abb..1fb5241055474 100644 --- a/libcxx/include/__cxx03/__chrono/concepts.h +++ b/libcxx/include/__cxx03/__chrono/concepts.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___CHRONO_CONCEPTS_H #define _LIBCPP___CHRONO_CONCEPTS_H -#include <__chrono/hh_mm_ss.h> -#include <__chrono/time_point.h> -#include <__config> -#include <__type_traits/is_specialization.h> +#include <__cxx03/__chrono/hh_mm_ss.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_specialization.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/convert_to_timespec.h b/libcxx/include/__cxx03/__chrono/convert_to_timespec.h index 11e0b826d05b4..df0e7714ddd70 100644 --- a/libcxx/include/__cxx03/__chrono/convert_to_timespec.h +++ b/libcxx/include/__cxx03/__chrono/convert_to_timespec.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___CHRONO_CONVERT_TO_TIMESPEC_H #define _LIBCPP___CHRONO_CONVERT_TO_TIMESPEC_H -#include <__chrono/duration.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> 
+#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/convert_to_tm.h b/libcxx/include/__cxx03/__chrono/convert_to_tm.h index 3a51019b80784..ccc7ee7b11973 100644 --- a/libcxx/include/__cxx03/__chrono/convert_to_tm.h +++ b/libcxx/include/__cxx03/__chrono/convert_to_tm.h @@ -10,42 +10,42 @@ #ifndef _LIBCPP___CHRONO_CONVERT_TO_TM_H #define _LIBCPP___CHRONO_CONVERT_TO_TM_H -#include <__chrono/calendar.h> -#include <__chrono/concepts.h> -#include <__chrono/day.h> -#include <__chrono/duration.h> -#include <__chrono/file_clock.h> -#include <__chrono/hh_mm_ss.h> -#include <__chrono/local_info.h> -#include <__chrono/month.h> -#include <__chrono/month_weekday.h> -#include <__chrono/monthday.h> -#include <__chrono/statically_widen.h> -#include <__chrono/sys_info.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__chrono/weekday.h> -#include <__chrono/year.h> -#include <__chrono/year_month.h> -#include <__chrono/year_month_day.h> -#include <__chrono/year_month_weekday.h> -#include <__chrono/zoned_time.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/format_error.h> -#include <__memory/addressof.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_specialization.h> -#include -#include -#include +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/concepts.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/file_clock.h> +#include <__cxx03/__chrono/hh_mm_ss.h> +#include <__cxx03/__chrono/local_info.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/month_weekday.h> +#include <__cxx03/__chrono/monthday.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__chrono/sys_info.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__chrono/weekday.h> +#include <__cxx03/__chrono/year.h> +#include 
<__cxx03/__chrono/year_month.h> +#include <__cxx03/__chrono/year_month_day.h> +#include <__cxx03/__chrono/year_month_weekday.h> +#include <__cxx03/__chrono/zoned_time.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_specialization.h> +#include <__cxx03/cstdint> +#include <__cxx03/ctime> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/day.h b/libcxx/include/__cxx03/__chrono/day.h index 7342084b08c88..6df2aa4283461 100644 --- a/libcxx/include/__cxx03/__chrono/day.h +++ b/libcxx/include/__cxx03/__chrono/day.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_DAY_H #define _LIBCPP___CHRONO_DAY_H -#include <__chrono/duration.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/duration.h b/libcxx/include/__cxx03/__chrono/duration.h index 1e36d7342836f..5fabe08db1bfc 100644 --- a/libcxx/include/__cxx03/__chrono/duration.h +++ b/libcxx/include/__cxx03/__chrono/duration.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___CHRONO_DURATION_H #define _LIBCPP___CHRONO_DURATION_H -#include <__compare/ordering.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_floating_point.h> -#include -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include 
<__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/limits> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -544,7 +544,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/type_traits> #endif #endif // _LIBCPP___CHRONO_DURATION_H diff --git a/libcxx/include/__cxx03/__chrono/exception.h b/libcxx/include/__cxx03/__chrono/exception.h index 266f8fac44176..3e1a29c203772 100644 --- a/libcxx/include/__cxx03/__chrono/exception.h +++ b/libcxx/include/__cxx03/__chrono/exception.h @@ -12,19 +12,19 @@ #ifndef _LIBCPP___CHRONO_EXCEPTION_H #define _LIBCPP___CHRONO_EXCEPTION_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/calendar.h> -# include <__chrono/local_info.h> -# include <__chrono/time_point.h> -# include <__config> -# include <__configuration/availability.h> -# include <__verbose_abort> -# include -# include -# include +# include <__cxx03/__chrono/calendar.h> +# include <__cxx03/__chrono/local_info.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__config> +# include <__cxx03/__configuration/availability.h> +# include <__cxx03/__verbose_abort> +# include <__cxx03/format> +# include <__cxx03/stdexcept> +# include <__cxx03/string> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/file_clock.h b/libcxx/include/__cxx03/__chrono/file_clock.h index 4dd3f88ce5ba4..aef92057bd04a 100644 --- a/libcxx/include/__cxx03/__chrono/file_clock.h +++ b/libcxx/include/__cxx03/__chrono/file_clock.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___CHRONO_FILE_CLOCK_H #define _LIBCPP___CHRONO_FILE_CLOCK_H -#include <__chrono/duration.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/formatter.h b/libcxx/include/__cxx03/__chrono/formatter.h index 449c415e95760..c46dfa3a2da2c 100644 --- a/libcxx/include/__cxx03/__chrono/formatter.h +++ b/libcxx/include/__cxx03/__chrono/formatter.h @@ -10,47 +10,47 @@ #ifndef _LIBCPP___CHRONO_FORMATTER_H #define _LIBCPP___CHRONO_FORMATTER_H -#include <__algorithm/ranges_copy.h> -#include <__chrono/calendar.h> -#include <__chrono/concepts.h> -#include <__chrono/convert_to_tm.h> -#include <__chrono/day.h> -#include <__chrono/duration.h> -#include <__chrono/file_clock.h> 
-#include <__chrono/hh_mm_ss.h> -#include <__chrono/local_info.h> -#include <__chrono/month.h> -#include <__chrono/month_weekday.h> -#include <__chrono/monthday.h> -#include <__chrono/ostream.h> -#include <__chrono/parser_std_format_spec.h> -#include <__chrono/statically_widen.h> -#include <__chrono/sys_info.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__chrono/weekday.h> -#include <__chrono/year.h> -#include <__chrono/year_month.h> -#include <__chrono/year_month_day.h> -#include <__chrono/year_month_weekday.h> -#include <__chrono/zoned_time.h> -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_error.h> -#include <__format/format_functions.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/parser_std_format_spec.h> -#include <__format/write_escaped.h> -#include <__memory/addressof.h> -#include <__type_traits/is_specialization.h> -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/concepts.h> +#include <__cxx03/__chrono/convert_to_tm.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/file_clock.h> +#include <__cxx03/__chrono/hh_mm_ss.h> +#include <__cxx03/__chrono/local_info.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/month_weekday.h> +#include <__cxx03/__chrono/monthday.h> +#include <__cxx03/__chrono/ostream.h> +#include <__cxx03/__chrono/parser_std_format_spec.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__chrono/sys_info.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__chrono/weekday.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__chrono/year_month.h> +#include <__cxx03/__chrono/year_month_day.h> 
+#include <__cxx03/__chrono/year_month_weekday.h> +#include <__cxx03/__chrono/zoned_time.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/format_functions.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__format/write_escaped.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_specialization.h> +#include <__cxx03/cmath> +#include <__cxx03/ctime> +#include <__cxx03/limits> +#include <__cxx03/sstream> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/hh_mm_ss.h b/libcxx/include/__cxx03/__chrono/hh_mm_ss.h index 57d2247fe6a3c..100687064ed88 100644 --- a/libcxx/include/__cxx03/__chrono/hh_mm_ss.h +++ b/libcxx/include/__cxx03/__chrono/hh_mm_ss.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___CHRONO_HH_MM_SS_H #define _LIBCPP___CHRONO_HH_MM_SS_H -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__config> -#include <__type_traits/common_type.h> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/high_resolution_clock.h b/libcxx/include/__cxx03/__chrono/high_resolution_clock.h index 0697fd2de9b4d..9c5104338421b 100644 --- a/libcxx/include/__cxx03/__chrono/high_resolution_clock.h +++ b/libcxx/include/__cxx03/__chrono/high_resolution_clock.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_HIGH_RESOLUTION_CLOCK_H #define _LIBCPP___CHRONO_HIGH_RESOLUTION_CLOCK_H -#include 
<__chrono/steady_clock.h> -#include <__chrono/system_clock.h> -#include <__config> +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/leap_second.h b/libcxx/include/__cxx03/__chrono/leap_second.h index 1a0e7f3107de8..900eff22aa856 100644 --- a/libcxx/include/__cxx03/__chrono/leap_second.h +++ b/libcxx/include/__cxx03/__chrono/leap_second.h @@ -12,17 +12,17 @@ #ifndef _LIBCPP___CHRONO_LEAP_SECOND_H #define _LIBCPP___CHRONO_LEAP_SECOND_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/duration.h> -# include <__chrono/system_clock.h> -# include <__chrono/time_point.h> -# include <__compare/ordering.h> -# include <__compare/three_way_comparable.h> -# include <__config> -# include <__utility/private_constructor_tag.h> +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/system_clock.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__compare/ordering.h> +# include <__cxx03/__compare/three_way_comparable.h> +# include <__cxx03/__config> +# include <__cxx03/__utility/private_constructor_tag.h> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/literals.h b/libcxx/include/__cxx03/__chrono/literals.h index 89800440edf43..d299fb97da3c9 100644 --- a/libcxx/include/__cxx03/__chrono/literals.h +++ b/libcxx/include/__cxx03/__chrono/literals.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_LITERALS_H #define _LIBCPP___CHRONO_LITERALS_H -#include <__chrono/day.h> -#include <__chrono/year.h> -#include <__config> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/local_info.h b/libcxx/include/__cxx03/__chrono/local_info.h index cfe1448904d3f..8e2194c05aba1 100644 --- a/libcxx/include/__cxx03/__chrono/local_info.h +++ b/libcxx/include/__cxx03/__chrono/local_info.h @@ -12,12 +12,12 @@ #ifndef _LIBCPP___CHRONO_LOCAL_INFO_H #define _LIBCPP___CHRONO_LOCAL_INFO_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/sys_info.h> -# include <__config> +# include <__cxx03/__chrono/sys_info.h> +# include <__cxx03/__config> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/month.h b/libcxx/include/__cxx03/__chrono/month.h index ce5cc21aab7d1..fd66bfea2f60d 100644 --- a/libcxx/include/__cxx03/__chrono/month.h +++ b/libcxx/include/__cxx03/__chrono/month.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_MONTH_H #define _LIBCPP___CHRONO_MONTH_H -#include <__chrono/duration.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/month_weekday.h b/libcxx/include/__cxx03/__chrono/month_weekday.h index 7919879655214..6595618d822c0 100644 --- a/libcxx/include/__cxx03/__chrono/month_weekday.h +++ b/libcxx/include/__cxx03/__chrono/month_weekday.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_MONTH_WEEKDAY_H #define _LIBCPP___CHRONO_MONTH_WEEKDAY_H -#include <__chrono/month.h> -#include <__chrono/weekday.h> -#include <__config> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/weekday.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__chrono/monthday.h b/libcxx/include/__cxx03/__chrono/monthday.h index a89d16e518618..fe85e6b8bab32 100644 --- a/libcxx/include/__cxx03/__chrono/monthday.h +++ b/libcxx/include/__cxx03/__chrono/monthday.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___CHRONO_MONTHDAY_H #define _LIBCPP___CHRONO_MONTHDAY_H -#include <__chrono/calendar.h> -#include <__chrono/day.h> -#include <__chrono/month.h> -#include <__config> -#include +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__config> +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/ostream.h b/libcxx/include/__cxx03/__chrono/ostream.h index e6c43254eea15..6ac6b2831e117 100644 --- a/libcxx/include/__cxx03/__chrono/ostream.h +++ b/libcxx/include/__cxx03/__chrono/ostream.h @@ -10,29 +10,29 @@ #ifndef _LIBCPP___CHRONO_OSTREAM_H #define _LIBCPP___CHRONO_OSTREAM_H -#include <__chrono/calendar.h> -#include <__chrono/day.h> -#include <__chrono/duration.h> -#include <__chrono/file_clock.h> -#include <__chrono/hh_mm_ss.h> -#include <__chrono/local_info.h> -#include <__chrono/month.h> -#include <__chrono/month_weekday.h> -#include <__chrono/monthday.h> -#include <__chrono/statically_widen.h> -#include <__chrono/sys_info.h> -#include <__chrono/system_clock.h> -#include <__chrono/weekday.h> -#include <__chrono/year.h> -#include <__chrono/year_month.h> -#include <__chrono/year_month_day.h> -#include <__chrono/year_month_weekday.h> -#include <__chrono/zoned_time.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/format_functions.h> -#include <__fwd/ostream.h> -#include +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/file_clock.h> +#include <__cxx03/__chrono/hh_mm_ss.h> +#include <__cxx03/__chrono/local_info.h> +#include 
<__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/month_weekday.h> +#include <__cxx03/__chrono/monthday.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__chrono/sys_info.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/weekday.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__chrono/year_month.h> +#include <__cxx03/__chrono/year_month_day.h> +#include <__cxx03/__chrono/year_month_weekday.h> +#include <__cxx03/__chrono/zoned_time.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/format_functions.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/parser_std_format_spec.h b/libcxx/include/__cxx03/__chrono/parser_std_format_spec.h index 785bbae198e46..3a09a21ac54b3 100644 --- a/libcxx/include/__cxx03/__chrono/parser_std_format_spec.h +++ b/libcxx/include/__cxx03/__chrono/parser_std_format_spec.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___CHRONO_PARSER_STD_FORMAT_SPEC_H #define _LIBCPP___CHRONO_PARSER_STD_FORMAT_SPEC_H -#include <__config> -#include <__format/concepts.h> -#include <__format/format_error.h> -#include <__format/format_parse_context.h> -#include <__format/formatter_string.h> -#include <__format/parser_std_format_spec.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter_string.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/statically_widen.h b/libcxx/include/__cxx03/__chrono/statically_widen.h index a18c46f057a81..101a903bc3f28 100644 --- a/libcxx/include/__cxx03/__chrono/statically_widen.h +++ 
b/libcxx/include/__cxx03/__chrono/statically_widen.h @@ -12,9 +12,9 @@ // Implements the STATICALLY-WIDEN exposition-only function. ([time.general]/2) -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/steady_clock.h b/libcxx/include/__cxx03/__chrono/steady_clock.h index 612a7f156e634..ad5b64b8142bc 100644 --- a/libcxx/include/__cxx03/__chrono/steady_clock.h +++ b/libcxx/include/__cxx03/__chrono/steady_clock.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___CHRONO_STEADY_CLOCK_H #define _LIBCPP___CHRONO_STEADY_CLOCK_H -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__config> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/sys_info.h b/libcxx/include/__cxx03/__chrono/sys_info.h index 11536cbde3a37..ed1c7cabd9b60 100644 --- a/libcxx/include/__cxx03/__chrono/sys_info.h +++ b/libcxx/include/__cxx03/__chrono/sys_info.h @@ -12,15 +12,15 @@ #ifndef _LIBCPP___CHRONO_SYS_INFO_H #define _LIBCPP___CHRONO_SYS_INFO_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/duration.h> -# include <__chrono/system_clock.h> -# include <__chrono/time_point.h> -# include <__config> -# include +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/system_clock.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__config> +# include <__cxx03/string> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/system_clock.h b/libcxx/include/__cxx03/__chrono/system_clock.h index 5a9eb65bdae7a..61382ebdfbe6d 100644 --- a/libcxx/include/__cxx03/__chrono/system_clock.h +++ b/libcxx/include/__cxx03/__chrono/system_clock.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___CHRONO_SYSTEM_CLOCK_H #define _LIBCPP___CHRONO_SYSTEM_CLOCK_H -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/ctime> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/time_point.h b/libcxx/include/__cxx03/__chrono/time_point.h index aaf0b098f280e..a26423efb15b4 100644 --- a/libcxx/include/__cxx03/__chrono/time_point.h +++ b/libcxx/include/__cxx03/__chrono/time_point.h @@ -10,21 +10,21 @@ #ifndef _LIBCPP___CHRONO_TIME_POINT_H #define _LIBCPP___CHRONO_TIME_POINT_H -#include <__chrono/duration.h> -#include <__compare/ordering.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/enable_if.h> +#include 
<__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/time_zone.h b/libcxx/include/__cxx03/__chrono/time_zone.h index de11dac1eef0c..f14646787cf03 100644 --- a/libcxx/include/__cxx03/__chrono/time_zone.h +++ b/libcxx/include/__cxx03/__chrono/time_zone.h @@ -12,28 +12,28 @@ #ifndef _LIBCPP___CHRONO_TIME_ZONE_H #define _LIBCPP___CHRONO_TIME_ZONE_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/calendar.h> -# include <__chrono/duration.h> -# include <__chrono/exception.h> -# include <__chrono/local_info.h> -# include <__chrono/sys_info.h> -# include <__chrono/system_clock.h> -# include <__compare/strong_order.h> -# include <__config> -# include <__memory/unique_ptr.h> -# include <__type_traits/common_type.h> -# include +# include <__cxx03/__chrono/calendar.h> +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/exception.h> +# include <__cxx03/__chrono/local_info.h> +# include <__cxx03/__chrono/sys_info.h> +# include <__cxx03/__chrono/system_clock.h> +# include <__cxx03/__compare/strong_order.h> +# include <__cxx03/__config> +# include <__cxx03/__memory/unique_ptr.h> +# include <__cxx03/__type_traits/common_type.h> +# include <__cxx03/string_view> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/time_zone_link.h b/libcxx/include/__cxx03/__chrono/time_zone_link.h index b2d365c5fd082..a19226b06d866 100644 --- a/libcxx/include/__cxx03/__chrono/time_zone_link.h 
+++ b/libcxx/include/__cxx03/__chrono/time_zone_link.h @@ -12,22 +12,22 @@ #ifndef _LIBCPP___CHRONO_TIME_ZONE_LINK_H #define _LIBCPP___CHRONO_TIME_ZONE_LINK_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__compare/strong_order.h> -# include <__config> -# include <__utility/private_constructor_tag.h> -# include -# include +# include <__cxx03/__compare/strong_order.h> +# include <__cxx03/__config> +# include <__cxx03/__utility/private_constructor_tag.h> +# include <__cxx03/string> +# include <__cxx03/string_view> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/tzdb.h b/libcxx/include/__cxx03/__chrono/tzdb.h index f731f8c318be0..cb218314a8e85 100644 --- a/libcxx/include/__cxx03/__chrono/tzdb.h +++ b/libcxx/include/__cxx03/__chrono/tzdb.h @@ -12,24 +12,24 @@ #ifndef _LIBCPP___CHRONO_TZDB_H #define _LIBCPP___CHRONO_TZDB_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__algorithm/ranges_lower_bound.h> -# include <__chrono/leap_second.h> -# include <__chrono/time_zone.h> -# include <__chrono/time_zone_link.h> -# include <__config> -# include -# include +# include <__cxx03/__algorithm/ranges_lower_bound.h> +# include <__cxx03/__chrono/leap_second.h> +# include <__cxx03/__chrono/time_zone.h> +# include <__cxx03/__chrono/time_zone_link.h> +# include <__cxx03/__config> +# include <__cxx03/string> +# include <__cxx03/vector> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__chrono/tzdb_list.h b/libcxx/include/__cxx03/__chrono/tzdb_list.h index aeef4fe1aba3c..96d52da88b108 100644 --- a/libcxx/include/__cxx03/__chrono/tzdb_list.h +++ b/libcxx/include/__cxx03/__chrono/tzdb_list.h @@ -12,15 +12,15 @@ #ifndef _LIBCPP___CHRONO_TZDB_LIST_H #define _LIBCPP___CHRONO_TZDB_LIST_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/time_zone.h> -# include <__chrono/tzdb.h> -# include <__config> -# include <__fwd/string.h> -# include +# include <__cxx03/__chrono/time_zone.h> +# include <__cxx03/__chrono/tzdb.h> +# include <__cxx03/__config> +# include <__cxx03/__fwd/string.h> +# include <__cxx03/forward_list> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/weekday.h b/libcxx/include/__cxx03/__chrono/weekday.h index 86c780cc71825..d7a0cc8496dcd 100644 --- a/libcxx/include/__cxx03/__chrono/weekday.h +++ b/libcxx/include/__cxx03/__chrono/weekday.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___CHRONO_WEEKDAY_H #define _LIBCPP___CHRONO_WEEKDAY_H -#include <__chrono/calendar.h> -#include <__chrono/duration.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__config> +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/year.h b/libcxx/include/__cxx03/__chrono/year.h index 1899d09f38dbd..2850616a7aa21 100644 --- a/libcxx/include/__cxx03/__chrono/year.h +++ b/libcxx/include/__cxx03/__chrono/year.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___CHRONO_YEAR_H #define _LIBCPP___CHRONO_YEAR_H -#include <__chrono/duration.h> -#include <__config> -#include -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/compare> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__chrono/year_month.h b/libcxx/include/__cxx03/__chrono/year_month.h index 
369ea38f7560d..75784df9386d8 100644 --- a/libcxx/include/__cxx03/__chrono/year_month.h +++ b/libcxx/include/__cxx03/__chrono/year_month.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___CHRONO_YEAR_MONTH_H #define _LIBCPP___CHRONO_YEAR_MONTH_H -#include <__chrono/duration.h> -#include <__chrono/month.h> -#include <__chrono/year.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__config> +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/year_month_day.h b/libcxx/include/__cxx03/__chrono/year_month_day.h index b06c0be03e0de..dff7423a0ffb3 100644 --- a/libcxx/include/__cxx03/__chrono/year_month_day.h +++ b/libcxx/include/__cxx03/__chrono/year_month_day.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___CHRONO_YEAR_MONTH_DAY_H #define _LIBCPP___CHRONO_YEAR_MONTH_DAY_H -#include <__chrono/calendar.h> -#include <__chrono/day.h> -#include <__chrono/duration.h> -#include <__chrono/month.h> -#include <__chrono/monthday.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__chrono/year.h> -#include <__chrono/year_month.h> -#include <__config> -#include -#include +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/monthday.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__chrono/year_month.h> +#include <__cxx03/__config> +#include <__cxx03/compare> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/year_month_weekday.h b/libcxx/include/__cxx03/__chrono/year_month_weekday.h index 0c3dd494c8787..3177f21964862 100644 --- 
a/libcxx/include/__cxx03/__chrono/year_month_weekday.h +++ b/libcxx/include/__cxx03/__chrono/year_month_weekday.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___CHRONO_YEAR_MONTH_WEEKDAY_H #define _LIBCPP___CHRONO_YEAR_MONTH_WEEKDAY_H -#include <__chrono/calendar.h> -#include <__chrono/day.h> -#include <__chrono/duration.h> -#include <__chrono/month.h> -#include <__chrono/month_weekday.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__chrono/weekday.h> -#include <__chrono/year.h> -#include <__chrono/year_month.h> -#include <__chrono/year_month_day.h> -#include <__config> +#include <__cxx03/__chrono/calendar.h> +#include <__cxx03/__chrono/day.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/month.h> +#include <__cxx03/__chrono/month_weekday.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__chrono/weekday.h> +#include <__cxx03/__chrono/year.h> +#include <__cxx03/__chrono/year_month.h> +#include <__cxx03/__chrono/year_month_day.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__chrono/zoned_time.h b/libcxx/include/__cxx03/__chrono/zoned_time.h index 8cfa2122642c5..f3333a4189cc5 100644 --- a/libcxx/include/__cxx03/__chrono/zoned_time.h +++ b/libcxx/include/__cxx03/__chrono/zoned_time.h @@ -12,29 +12,29 @@ #ifndef _LIBCPP___CHRONO_ZONED_TIME_H #define _LIBCPP___CHRONO_ZONED_TIME_H -#include +#include <__cxx03/version> // Enable the contents of the header only when libc++ was built with experimental features enabled. 
#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) -# include <__chrono/calendar.h> -# include <__chrono/duration.h> -# include <__chrono/sys_info.h> -# include <__chrono/system_clock.h> -# include <__chrono/time_zone.h> -# include <__chrono/tzdb_list.h> -# include <__config> -# include <__fwd/string_view.h> -# include <__type_traits/common_type.h> -# include <__type_traits/conditional.h> -# include <__type_traits/remove_cvref.h> -# include <__utility/move.h> +# include <__cxx03/__chrono/calendar.h> +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/sys_info.h> +# include <__cxx03/__chrono/system_clock.h> +# include <__cxx03/__chrono/time_zone.h> +# include <__cxx03/__chrono/tzdb_list.h> +# include <__cxx03/__config> +# include <__cxx03/__fwd/string_view.h> +# include <__cxx03/__type_traits/common_type.h> +# include <__cxx03/__type_traits/conditional.h> +# include <__cxx03/__type_traits/remove_cvref.h> +# include <__cxx03/__utility/move.h> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__compare/common_comparison_category.h b/libcxx/include/__cxx03/__compare/common_comparison_category.h index 7aeb3da03a4f4..795884d13177f 100644 --- a/libcxx/include/__cxx03/__compare/common_comparison_category.h +++ b/libcxx/include/__cxx03/__compare/common_comparison_category.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___COMPARE_COMMON_COMPARISON_CATEGORY_H #define _LIBCPP___COMPARE_COMMON_COMPARISON_CATEGORY_H -#include <__compare/ordering.h> -#include <__config> -#include <__type_traits/is_same.h> -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__compare/compare_partial_order_fallback.h b/libcxx/include/__cxx03/__compare/compare_partial_order_fallback.h index e0efa3ccb88db..dc939d8f9a6b8 100644 --- a/libcxx/include/__cxx03/__compare/compare_partial_order_fallback.h +++ b/libcxx/include/__cxx03/__compare/compare_partial_order_fallback.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___COMPARE_COMPARE_PARTIAL_ORDER_FALLBACK #define _LIBCPP___COMPARE_COMPARE_PARTIAL_ORDER_FALLBACK -#include <__compare/ordering.h> -#include <__compare/partial_order.h> -#include <__config> -#include <__type_traits/decay.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/partial_order.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/priority_tag.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/compare_strong_order_fallback.h b/libcxx/include/__cxx03/__compare/compare_strong_order_fallback.h index a94d517ed30fc..5ea1ce7c64d12 100644 --- a/libcxx/include/__cxx03/__compare/compare_strong_order_fallback.h +++ b/libcxx/include/__cxx03/__compare/compare_strong_order_fallback.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___COMPARE_COMPARE_STRONG_ORDER_FALLBACK #define _LIBCPP___COMPARE_COMPARE_STRONG_ORDER_FALLBACK -#include <__compare/ordering.h> -#include <__compare/strong_order.h> -#include <__config> -#include <__type_traits/decay.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/strong_order.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include 
<__cxx03/__utility/priority_tag.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/compare_three_way.h b/libcxx/include/__cxx03/__compare/compare_three_way.h index 01c12076c0d73..258ad43103429 100644 --- a/libcxx/include/__cxx03/__compare/compare_three_way.h +++ b/libcxx/include/__cxx03/__compare/compare_three_way.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___COMPARE_COMPARE_THREE_WAY_H #define _LIBCPP___COMPARE_COMPARE_THREE_WAY_H -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__utility/forward.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/compare_three_way_result.h b/libcxx/include/__cxx03/__compare/compare_three_way_result.h index d7508073433af..7577b9ee15f90 100644 --- a/libcxx/include/__cxx03/__compare/compare_three_way_result.h +++ b/libcxx/include/__cxx03/__compare/compare_three_way_result.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___COMPARE_COMPARE_THREE_WAY_RESULT_H #define _LIBCPP___COMPARE_COMPARE_THREE_WAY_RESULT_H -#include <__config> -#include <__type_traits/make_const_lvalue_ref.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/make_const_lvalue_ref.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/compare_weak_order_fallback.h b/libcxx/include/__cxx03/__compare/compare_weak_order_fallback.h index 062b7b582cd7e..e12dc8eb5c9c0 100644 --- a/libcxx/include/__cxx03/__compare/compare_weak_order_fallback.h +++ b/libcxx/include/__cxx03/__compare/compare_weak_order_fallback.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___COMPARE_COMPARE_WEAK_ORDER_FALLBACK #define _LIBCPP___COMPARE_COMPARE_WEAK_ORDER_FALLBACK -#include 
<__compare/ordering.h> -#include <__compare/weak_order.h> -#include <__config> -#include <__type_traits/decay.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/weak_order.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/priority_tag.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/is_eq.h b/libcxx/include/__cxx03/__compare/is_eq.h index 9a82df1ebe88b..09cc7a2c85d4e 100644 --- a/libcxx/include/__cxx03/__compare/is_eq.h +++ b/libcxx/include/__cxx03/__compare/is_eq.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___COMPARE_IS_EQ_H #define _LIBCPP___COMPARE_IS_EQ_H -#include <__compare/ordering.h> -#include <__config> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/ordering.h b/libcxx/include/__cxx03/__compare/ordering.h index 2995d381304f0..71c199991f330 100644 --- a/libcxx/include/__cxx03/__compare/ordering.h +++ b/libcxx/include/__cxx03/__compare/ordering.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___COMPARE_ORDERING_H #define _LIBCPP___COMPARE_ORDERING_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_same.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/partial_order.h b/libcxx/include/__cxx03/__compare/partial_order.h index 1d2fae63e5f24..df8b842e7bf84 100644 --- a/libcxx/include/__cxx03/__compare/partial_order.h +++ b/libcxx/include/__cxx03/__compare/partial_order.h @@ -9,14 +9,14 @@ 
#ifndef _LIBCPP___COMPARE_PARTIAL_ORDER #define _LIBCPP___COMPARE_PARTIAL_ORDER -#include <__compare/compare_three_way.h> -#include <__compare/ordering.h> -#include <__compare/weak_order.h> -#include <__config> -#include <__type_traits/decay.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/weak_order.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/priority_tag.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/strong_order.h b/libcxx/include/__cxx03/__compare/strong_order.h index 8c363b5638222..7522325912351 100644 --- a/libcxx/include/__cxx03/__compare/strong_order.h +++ b/libcxx/include/__cxx03/__compare/strong_order.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___COMPARE_STRONG_ORDER #define _LIBCPP___COMPARE_STRONG_ORDER -#include <__bit/bit_cast.h> -#include <__compare/compare_three_way.h> -#include <__compare/ordering.h> -#include <__config> -#include <__math/exponential_functions.h> -#include <__math/traits.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> -#include -#include +#include <__cxx03/__bit/bit_cast.h> +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__math/exponential_functions.h> +#include <__cxx03/__math/traits.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_same.h> +#include 
<__cxx03/__utility/forward.h> +#include <__cxx03/__utility/priority_tag.h> +#include <__cxx03/cstdint> +#include <__cxx03/limits> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__compare/synth_three_way.h b/libcxx/include/__cxx03/__compare/synth_three_way.h index e48ce49799836..fbaaf61864299 100644 --- a/libcxx/include/__cxx03/__compare/synth_three_way.h +++ b/libcxx/include/__cxx03/__compare/synth_three_way.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___COMPARE_SYNTH_THREE_WAY_H #define _LIBCPP___COMPARE_SYNTH_THREE_WAY_H -#include <__compare/ordering.h> -#include <__compare/three_way_comparable.h> -#include <__concepts/boolean_testable.h> -#include <__config> -#include <__utility/declval.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/three_way_comparable.h b/libcxx/include/__cxx03/__compare/three_way_comparable.h index 7a44ea9158a6f..9ad84d2bde498 100644 --- a/libcxx/include/__cxx03/__compare/three_way_comparable.h +++ b/libcxx/include/__cxx03/__compare/three_way_comparable.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___COMPARE_THREE_WAY_COMPARABLE_H #define _LIBCPP___COMPARE_THREE_WAY_COMPARABLE_H -#include <__compare/common_comparison_category.h> -#include <__compare/ordering.h> -#include <__concepts/common_reference_with.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/same_as.h> -#include <__concepts/totally_ordered.h> -#include <__config> -#include <__type_traits/common_reference.h> -#include <__type_traits/make_const_lvalue_ref.h> +#include <__cxx03/__compare/common_comparison_category.h> 
+#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/totally_ordered.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/make_const_lvalue_ref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__compare/weak_order.h b/libcxx/include/__cxx03/__compare/weak_order.h index 1a3e85feb233b..e4c4797c06db6 100644 --- a/libcxx/include/__cxx03/__compare/weak_order.h +++ b/libcxx/include/__cxx03/__compare/weak_order.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___COMPARE_WEAK_ORDER #define _LIBCPP___COMPARE_WEAK_ORDER -#include <__compare/compare_three_way.h> -#include <__compare/ordering.h> -#include <__compare/strong_order.h> -#include <__config> -#include <__math/traits.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/priority_tag.h> +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/strong_order.h> +#include <__cxx03/__config> +#include <__cxx03/__math/traits.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/priority_tag.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/arithmetic.h b/libcxx/include/__cxx03/__concepts/arithmetic.h index 0c44f117805f3..a8ef1d1532cac 100644 --- a/libcxx/include/__cxx03/__concepts/arithmetic.h +++ b/libcxx/include/__cxx03/__concepts/arithmetic.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___CONCEPTS_ARITHMETIC_H #define _LIBCPP___CONCEPTS_ARITHMETIC_H 
-#include <__config> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/is_signed_integer.h> -#include <__type_traits/is_unsigned_integer.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/is_signed_integer.h> +#include <__cxx03/__type_traits/is_unsigned_integer.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/assignable.h b/libcxx/include/__cxx03/__concepts/assignable.h index 7423daabba780..563deb5e4cd69 100644 --- a/libcxx/include/__cxx03/__concepts/assignable.h +++ b/libcxx/include/__cxx03/__concepts/assignable.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___CONCEPTS_ASSIGNABLE_H #define _LIBCPP___CONCEPTS_ASSIGNABLE_H -#include <__concepts/common_reference_with.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/is_reference.h> -#include <__type_traits/make_const_lvalue_ref.h> -#include <__utility/forward.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/make_const_lvalue_ref.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/boolean_testable.h b/libcxx/include/__cxx03/__concepts/boolean_testable.h index b379fe9c5a880..1f655c1b19cdf 100644 --- a/libcxx/include/__cxx03/__concepts/boolean_testable.h +++ b/libcxx/include/__cxx03/__concepts/boolean_testable.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_BOOLEAN_TESTABLE_H #define _LIBCPP___CONCEPTS_BOOLEAN_TESTABLE_H -#include <__concepts/convertible_to.h> -#include <__config> -#include 
<__utility/forward.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/class_or_enum.h b/libcxx/include/__cxx03/__concepts/class_or_enum.h index 2739e31e14ba6..ef9a61cdbebf7 100644 --- a/libcxx/include/__cxx03/__concepts/class_or_enum.h +++ b/libcxx/include/__cxx03/__concepts/class_or_enum.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___CONCEPTS_CLASS_OR_ENUM_H #define _LIBCPP___CONCEPTS_CLASS_OR_ENUM_H -#include <__config> -#include <__type_traits/is_class.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/is_union.h> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_union.h> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/common_reference_with.h b/libcxx/include/__cxx03/__concepts/common_reference_with.h index 4eb687e071bc5..8e0b5bafe7862 100644 --- a/libcxx/include/__cxx03/__concepts/common_reference_with.h +++ b/libcxx/include/__cxx03/__concepts/common_reference_with.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___CONCEPTS_COMMON_REFERENCE_WITH_H #define _LIBCPP___CONCEPTS_COMMON_REFERENCE_WITH_H -#include <__concepts/convertible_to.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/common_reference.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/common_with.h b/libcxx/include/__cxx03/__concepts/common_with.h index 
85abb05efbc29..37f4e3c30c2b2 100644 --- a/libcxx/include/__cxx03/__concepts/common_with.h +++ b/libcxx/include/__cxx03/__concepts/common_with.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___CONCEPTS_COMMON_WITH_H #define _LIBCPP___CONCEPTS_COMMON_WITH_H -#include <__concepts/common_reference_with.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/common_reference.h> -#include <__type_traits/common_type.h> -#include <__utility/declval.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/constructible.h b/libcxx/include/__cxx03/__concepts/constructible.h index 835a44429c092..356ca47626071 100644 --- a/libcxx/include/__cxx03/__concepts/constructible.h +++ b/libcxx/include/__cxx03/__concepts/constructible.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___CONCEPTS_CONSTRUCTIBLE_H #define _LIBCPP___CONCEPTS_CONSTRUCTIBLE_H -#include <__concepts/convertible_to.h> -#include <__concepts/destructible.h> -#include <__config> -#include <__type_traits/is_constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/destructible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_constructible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/convertible_to.h b/libcxx/include/__cxx03/__concepts/convertible_to.h index 6d5b6c1268d5d..4802621c93ef7 100644 --- a/libcxx/include/__cxx03/__concepts/convertible_to.h +++ b/libcxx/include/__cxx03/__concepts/convertible_to.h @@ -9,9 +9,9 @@ #ifndef 
_LIBCPP___CONCEPTS_CONVERTIBLE_TO_H #define _LIBCPP___CONCEPTS_CONVERTIBLE_TO_H -#include <__config> -#include <__type_traits/is_convertible.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/copyable.h b/libcxx/include/__cxx03/__concepts/copyable.h index 2bf0ad42fc1a8..bcadc7382f50f 100644 --- a/libcxx/include/__cxx03/__concepts/copyable.h +++ b/libcxx/include/__cxx03/__concepts/copyable.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___CONCEPTS_COPYABLE_H #define _LIBCPP___CONCEPTS_COPYABLE_H -#include <__concepts/assignable.h> -#include <__concepts/constructible.h> -#include <__concepts/movable.h> -#include <__config> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/derived_from.h b/libcxx/include/__cxx03/__concepts/derived_from.h index 9875faee81b90..861c84f0cf6ba 100644 --- a/libcxx/include/__cxx03/__concepts/derived_from.h +++ b/libcxx/include/__cxx03/__concepts/derived_from.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_DERIVED_FROM_H #define _LIBCPP___CONCEPTS_DERIVED_FROM_H -#include <__config> -#include <__type_traits/is_base_of.h> -#include <__type_traits/is_convertible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/is_convertible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/destructible.h b/libcxx/include/__cxx03/__concepts/destructible.h index 28b4b1bc24ec9..5aa9a908648cd 100644 --- a/libcxx/include/__cxx03/__concepts/destructible.h +++ 
b/libcxx/include/__cxx03/__concepts/destructible.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___CONCEPTS_DESTRUCTIBLE_H #define _LIBCPP___CONCEPTS_DESTRUCTIBLE_H -#include <__config> -#include <__type_traits/is_nothrow_destructible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_nothrow_destructible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/different_from.h b/libcxx/include/__cxx03/__concepts/different_from.h index fd31f6e25805d..bedf3a0bf0b7f 100644 --- a/libcxx/include/__cxx03/__concepts/different_from.h +++ b/libcxx/include/__cxx03/__concepts/different_from.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_DIFFERENT_FROM_H #define _LIBCPP___CONCEPTS_DIFFERENT_FROM_H -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/equality_comparable.h b/libcxx/include/__cxx03/__concepts/equality_comparable.h index 278fc76409289..a77f8f3de60b2 100644 --- a/libcxx/include/__cxx03/__concepts/equality_comparable.h +++ b/libcxx/include/__cxx03/__concepts/equality_comparable.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___CONCEPTS_EQUALITY_COMPARABLE_H #define _LIBCPP___CONCEPTS_EQUALITY_COMPARABLE_H -#include <__concepts/boolean_testable.h> -#include <__concepts/common_reference_with.h> -#include <__config> -#include <__type_traits/common_reference.h> -#include <__type_traits/make_const_lvalue_ref.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/make_const_lvalue_ref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff 
--git a/libcxx/include/__cxx03/__concepts/invocable.h b/libcxx/include/__cxx03/__concepts/invocable.h index 8a29398b3a29f..fbbbf16e32fe9 100644 --- a/libcxx/include/__cxx03/__concepts/invocable.h +++ b/libcxx/include/__cxx03/__concepts/invocable.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_INVOCABLE_H #define _LIBCPP___CONCEPTS_INVOCABLE_H -#include <__config> -#include <__functional/invoke.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/movable.h b/libcxx/include/__cxx03/__concepts/movable.h index bc5b9d767c6a5..883eda880b6f0 100644 --- a/libcxx/include/__cxx03/__concepts/movable.h +++ b/libcxx/include/__cxx03/__concepts/movable.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___CONCEPTS_MOVABLE_H #define _LIBCPP___CONCEPTS_MOVABLE_H -#include <__concepts/assignable.h> -#include <__concepts/constructible.h> -#include <__concepts/swappable.h> -#include <__config> -#include <__type_traits/is_object.h> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/swappable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_object.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/predicate.h b/libcxx/include/__cxx03/__concepts/predicate.h index 00731efc8fcd9..15007f3b09926 100644 --- a/libcxx/include/__cxx03/__concepts/predicate.h +++ b/libcxx/include/__cxx03/__concepts/predicate.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___CONCEPTS_PREDICATE_H #define _LIBCPP___CONCEPTS_PREDICATE_H -#include <__concepts/boolean_testable.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__functional/invoke.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__concepts/invocable.h> +#include 
<__cxx03/__config> +#include <__cxx03/__functional/invoke.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/regular.h b/libcxx/include/__cxx03/__concepts/regular.h index 9f3d8bf30be3e..faf04f2c6b943 100644 --- a/libcxx/include/__cxx03/__concepts/regular.h +++ b/libcxx/include/__cxx03/__concepts/regular.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_REGULAR_H #define _LIBCPP___CONCEPTS_REGULAR_H -#include <__concepts/equality_comparable.h> -#include <__concepts/semiregular.h> -#include <__config> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/relation.h b/libcxx/include/__cxx03/__concepts/relation.h index 7545a7db93da7..0d90406012e33 100644 --- a/libcxx/include/__cxx03/__concepts/relation.h +++ b/libcxx/include/__cxx03/__concepts/relation.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___CONCEPTS_RELATION_H #define _LIBCPP___CONCEPTS_RELATION_H -#include <__concepts/predicate.h> -#include <__config> +#include <__cxx03/__concepts/predicate.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/same_as.h b/libcxx/include/__cxx03/__concepts/same_as.h index 4241131c70c1f..6c81ecfbbfe5b 100644 --- a/libcxx/include/__cxx03/__concepts/same_as.h +++ b/libcxx/include/__cxx03/__concepts/same_as.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___CONCEPTS_SAME_AS_H #define _LIBCPP___CONCEPTS_SAME_AS_H -#include <__config> -#include <__type_traits/is_same.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/semiregular.h b/libcxx/include/__cxx03/__concepts/semiregular.h index 
7a159d17dfc10..2a3eb3d667292 100644 --- a/libcxx/include/__cxx03/__concepts/semiregular.h +++ b/libcxx/include/__cxx03/__concepts/semiregular.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___CONCEPTS_SEMIREGULAR_H #define _LIBCPP___CONCEPTS_SEMIREGULAR_H -#include <__concepts/constructible.h> -#include <__concepts/copyable.h> -#include <__config> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__concepts/swappable.h b/libcxx/include/__cxx03/__concepts/swappable.h index d339488a087a5..333a42ad0c500 100644 --- a/libcxx/include/__cxx03/__concepts/swappable.h +++ b/libcxx/include/__cxx03/__concepts/swappable.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___CONCEPTS_SWAPPABLE_H #define _LIBCPP___CONCEPTS_SWAPPABLE_H -#include <__concepts/assignable.h> -#include <__concepts/class_or_enum.h> -#include <__concepts/common_reference_with.h> -#include <__concepts/constructible.h> -#include <__config> -#include <__type_traits/extent.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/exchange.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/extent.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/exchange.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__concepts/totally_ordered.h b/libcxx/include/__cxx03/__concepts/totally_ordered.h index 186c3b430dd54..f35502b58f125 100644 --- a/libcxx/include/__cxx03/__concepts/totally_ordered.h +++ b/libcxx/include/__cxx03/__concepts/totally_ordered.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___CONCEPTS_TOTALLY_ORDERED_H #define _LIBCPP___CONCEPTS_TOTALLY_ORDERED_H -#include <__concepts/boolean_testable.h> -#include <__concepts/equality_comparable.h> -#include <__config> -#include <__type_traits/common_reference.h> -#include <__type_traits/make_const_lvalue_ref.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/make_const_lvalue_ref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__condition_variable/condition_variable.h b/libcxx/include/__cxx03/__condition_variable/condition_variable.h index de35aaca1070e..d96c6928fa7be 100644 --- a/libcxx/include/__cxx03/__condition_variable/condition_variable.h +++ b/libcxx/include/__cxx03/__condition_variable/condition_variable.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H #define _LIBCPP___CONDITION_VARIABLE_CONDITION_VARIABLE_H -#include <__chrono/duration.h> -#include <__chrono/steady_clock.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> -#include <__config> -#include <__mutex/mutex.h> -#include <__mutex/unique_lock.h> -#include <__system_error/system_error.h> -#include <__thread/support.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_floating_point.h> -#include <__utility/move.h> -#include -#include +#include 
<__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/__mutex/mutex.h> +#include <__cxx03/__mutex/unique_lock.h> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/limits> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config index 661af5be3c225..6f77292387776 100644 --- a/libcxx/include/__cxx03/__config +++ b/libcxx/include/__cxx03/__config @@ -11,10 +11,10 @@ #define _LIBCPP___CONFIG #include <__config_site> -#include <__configuration/abi.h> -#include <__configuration/availability.h> -#include <__configuration/compiler.h> -#include <__configuration/platform.h> +#include <__cxx03/__configuration/abi.h> +#include <__cxx03/__configuration/availability.h> +#include <__cxx03/__configuration/compiler.h> +#include <__cxx03/__configuration/platform.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header @@ -230,7 +230,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # endif # if defined(__MVS__) -# include // for __NATIVE_ASCII_F +# include <__cxx03/features.h> // for __NATIVE_ASCII_F # endif # if defined(_WIN32) diff --git a/libcxx/include/__cxx03/__configuration/abi.h b/libcxx/include/__cxx03/__configuration/abi.h index 0422b645727d8..0d11528877697 100644 --- a/libcxx/include/__cxx03/__configuration/abi.h +++ b/libcxx/include/__cxx03/__configuration/abi.h @@ -11,8 +11,8 @@ #define _LIBCPP___CONFIGURATION_ABI_H #include <__config_site> -#include <__configuration/compiler.h> -#include 
<__configuration/platform.h> +#include <__cxx03/__configuration/compiler.h> +#include <__cxx03/__configuration/platform.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__configuration/availability.h b/libcxx/include/__cxx03/__configuration/availability.h index ab483a07c9c13..e54ba50ae9a02 100644 --- a/libcxx/include/__cxx03/__configuration/availability.h +++ b/libcxx/include/__cxx03/__configuration/availability.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___CONFIGURATION_AVAILABILITY_H #define _LIBCPP___CONFIGURATION_AVAILABILITY_H -#include <__configuration/compiler.h> -#include <__configuration/language.h> +#include <__cxx03/__configuration/compiler.h> +#include <__cxx03/__configuration/language.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__coroutine/coroutine_handle.h b/libcxx/include/__cxx03/__coroutine/coroutine_handle.h index 4557a6643c239..183d113e2e69b 100644 --- a/libcxx/include/__cxx03/__coroutine/coroutine_handle.h +++ b/libcxx/include/__cxx03/__coroutine/coroutine_handle.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___COROUTINE_COROUTINE_HANDLE_H #define _LIBCPP___COROUTINE_COROUTINE_HANDLE_H -#include <__assert> -#include <__config> -#include <__functional/hash.h> -#include <__memory/addressof.h> -#include <__type_traits/remove_cv.h> -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/compare> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__coroutine/coroutine_traits.h b/libcxx/include/__cxx03/__coroutine/coroutine_traits.h index 78f05341f7486..b4fc2f45a1ac4 100644 --- a/libcxx/include/__cxx03/__coroutine/coroutine_traits.h +++ 
b/libcxx/include/__cxx03/__coroutine/coroutine_traits.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___COROUTINE_COROUTINE_TRAITS_H #define _LIBCPP___COROUTINE_COROUTINE_TRAITS_H -#include <__config> -#include <__type_traits/void_t.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/void_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__coroutine/noop_coroutine_handle.h b/libcxx/include/__cxx03/__coroutine/noop_coroutine_handle.h index da13d579604b5..2a6e25cde2f68 100644 --- a/libcxx/include/__cxx03/__coroutine/noop_coroutine_handle.h +++ b/libcxx/include/__cxx03/__coroutine/noop_coroutine_handle.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___COROUTINE_NOOP_COROUTINE_HANDLE_H #define _LIBCPP___COROUTINE_NOOP_COROUTINE_HANDLE_H -#include <__config> -#include <__coroutine/coroutine_handle.h> +#include <__cxx03/__config> +#include <__cxx03/__coroutine/coroutine_handle.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__coroutine/trivial_awaitables.h b/libcxx/include/__cxx03/__coroutine/trivial_awaitables.h index b604bd3c2d8ad..9420824b6cefc 100644 --- a/libcxx/include/__cxx03/__coroutine/trivial_awaitables.h +++ b/libcxx/include/__cxx03/__coroutine/trivial_awaitables.h @@ -9,8 +9,8 @@ #ifndef __LIBCPP___COROUTINE_TRIVIAL_AWAITABLES_H #define __LIBCPP___COROUTINE_TRIVIAL_AWAITABLES_H -#include <__config> -#include <__coroutine/coroutine_handle.h> +#include <__cxx03/__config> +#include <__cxx03/__coroutine/coroutine_handle.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__debug_utils/randomize_range.h b/libcxx/include/__cxx03/__debug_utils/randomize_range.h index 7eb77d81ab2a3..59e3a04f6d1ad 100644 --- a/libcxx/include/__cxx03/__debug_utils/randomize_range.h +++ b/libcxx/include/__cxx03/__debug_utils/randomize_range.h @@ -9,11 +9,11 @@ #ifndef 
_LIBCPP___LIBCXX_DEBUG_RANDOMIZE_RANGE_H #define _LIBCPP___LIBCXX_DEBUG_RANDOMIZE_RANGE_H -#include <__config> +#include <__cxx03/__config> #ifdef _LIBCPP_DEBUG_RANDOMIZE_UNSPECIFIED_STABILITY -# include <__algorithm/shuffle.h> -# include <__type_traits/is_constant_evaluated.h> +# include <__cxx03/__algorithm/shuffle.h> +# include <__cxx03/__type_traits/is_constant_evaluated.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__debug_utils/sanitizers.h b/libcxx/include/__cxx03/__debug_utils/sanitizers.h index d8547e3249330..cd0caa9ed0a3f 100644 --- a/libcxx/include/__cxx03/__debug_utils/sanitizers.h +++ b/libcxx/include/__cxx03/__debug_utils/sanitizers.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___LIBCXX_DEBUG_UTILS_SANITIZERS_H #define _LIBCPP___LIBCXX_DEBUG_UTILS_SANITIZERS_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constant_evaluated.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h b/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h index 3a9d887284164..6aec78e46ff89 100644 --- a/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h +++ b/libcxx/include/__cxx03/__debug_utils/strict_weak_ordering_check.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___LIBCXX_DEBUG_STRICT_WEAK_ORDERING_CHECK #define _LIBCPP___LIBCXX_DEBUG_STRICT_WEAK_ORDERING_CHECK -#include <__config> +#include <__cxx03/__config> -#include <__algorithm/comp_ref_type.h> -#include <__algorithm/is_sorted.h> -#include <__assert> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_constant_evaluated.h> +#include <__cxx03/__algorithm/comp_ref_type.h> +#include <__cxx03/__algorithm/is_sorted.h> +#include <__cxx03/__assert> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__exception/exception.h b/libcxx/include/__cxx03/__exception/exception.h index e724e1b99bd14..8557cbd4945e4 100644 --- a/libcxx/include/__cxx03/__exception/exception.h +++ b/libcxx/include/__cxx03/__exception/exception.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___EXCEPTION_EXCEPTION_H #define _LIBCPP___EXCEPTION_EXCEPTION_H -#include <__config> +#include <__cxx03/__config> // defines its own std::exception and std::bad_exception types, // which we use in order to be ABI-compatible with other STLs on Windows. #if defined(_LIBCPP_ABI_VCRUNTIME) -# include +# include <__cxx03/vcruntime_exception.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__exception/exception_ptr.h b/libcxx/include/__cxx03/__exception/exception_ptr.h index beadd9212abd1..6bdb47fd395f0 100644 --- a/libcxx/include/__cxx03/__exception/exception_ptr.h +++ b/libcxx/include/__cxx03/__exception/exception_ptr.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___EXCEPTION_EXCEPTION_PTR_H #define _LIBCPP___EXCEPTION_EXCEPTION_PTR_H -#include <__config> -#include <__exception/operations.h> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__type_traits/decay.h> -#include -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/operations.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdlib> +#include <__cxx03/new> +#include <__cxx03/typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__exception/nested_exception.h b/libcxx/include/__cxx03/__exception/nested_exception.h index feb489f87f62f..1b889e6bf5d87 100644 --- 
a/libcxx/include/__cxx03/__exception/nested_exception.h +++ b/libcxx/include/__cxx03/__exception/nested_exception.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___EXCEPTION_NESTED_EXCEPTION_H #define _LIBCPP___EXCEPTION_NESTED_EXCEPTION_H -#include <__config> -#include <__exception/exception_ptr.h> -#include <__memory/addressof.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_base_of.h> -#include <__type_traits/is_class.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_final.h> -#include <__type_traits/is_polymorphic.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/exception_ptr.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_final.h> +#include <__cxx03/__type_traits/is_polymorphic.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__exception/operations.h b/libcxx/include/__cxx03/__exception/operations.h index 0a9c7a7c7f0d8..cb67194553d37 100644 --- a/libcxx/include/__cxx03/__exception/operations.h +++ b/libcxx/include/__cxx03/__exception/operations.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___EXCEPTION_OPERATIONS_H #define _LIBCPP___EXCEPTION_OPERATIONS_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__exception/terminate.h b/libcxx/include/__cxx03/__exception/terminate.h index e672471dc5263..78ccd16c91393 100644 --- a/libcxx/include/__cxx03/__exception/terminate.h +++ 
b/libcxx/include/__cxx03/__exception/terminate.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___EXCEPTION_TERMINATE_H #define _LIBCPP___EXCEPTION_TERMINATE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__expected/bad_expected_access.h b/libcxx/include/__cxx03/__expected/bad_expected_access.h index 1b734389e8311..4f7e33d2c6986 100644 --- a/libcxx/include/__cxx03/__expected/bad_expected_access.h +++ b/libcxx/include/__cxx03/__expected/bad_expected_access.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H -#include <__config> -#include <__exception/exception.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__expected/expected.h b/libcxx/include/__cxx03/__expected/expected.h index 7a6f04c50dabf..adadea8e4b39c 100644 --- a/libcxx/include/__cxx03/__expected/expected.h +++ b/libcxx/include/__cxx03/__expected/expected.h @@ -9,49 +9,49 @@ #ifndef _LIBCPP___EXPECTED_EXPECTED_H #define _LIBCPP___EXPECTED_EXPECTED_H -#include <__assert> -#include <__config> -#include <__expected/bad_expected_access.h> -#include <__expected/unexpect.h> -#include <__expected/unexpected.h> -#include <__functional/invoke.h> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_function.h> -#include 
<__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/is_void.h> -#include <__type_traits/lazy.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/as_const.h> -#include <__utility/exception_guard.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__expected/bad_expected_access.h> +#include <__cxx03/__expected/unexpect.h> +#include <__cxx03/__expected/unexpected.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/lazy.h> 
+#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/as_const.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__expected/unexpect.h b/libcxx/include/__cxx03/__expected/unexpect.h index df52787d36faf..895f053a1e333 100644 --- a/libcxx/include/__cxx03/__expected/unexpect.h +++ b/libcxx/include/__cxx03/__expected/unexpect.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___EXPECTED_UNEXPECT_H #define _LIBCPP___EXPECTED_UNEXPECT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__expected/unexpected.h b/libcxx/include/__cxx03/__expected/unexpected.h index c7fe3c52e4311..fe2ab407b4151 100644 --- a/libcxx/include/__cxx03/__expected/unexpected.h +++ b/libcxx/include/__cxx03/__expected/unexpected.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___EXPECTED_UNEXPECTED_H #define _LIBCPP___EXPECTED_UNEXPECTED_H -#include <__config> -#include <__type_traits/conjunction.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_volatile.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include 
<__utility/in_place.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__filesystem/copy_options.h b/libcxx/include/__cxx03/__filesystem/copy_options.h index 097eebe61137d..c9adf3cd64eb4 100644 --- a/libcxx/include/__cxx03/__filesystem/copy_options.h +++ b/libcxx/include/__cxx03/__filesystem/copy_options.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FILESYSTEM_COPY_OPTIONS_H #define _LIBCPP___FILESYSTEM_COPY_OPTIONS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/directory_entry.h b/libcxx/include/__cxx03/__filesystem/directory_entry.h index 96d88dcd90b4c..a78f43d6b1c5c 100644 --- a/libcxx/include/__cxx03/__filesystem/directory_entry.h +++ b/libcxx/include/__cxx03/__filesystem/directory_entry.h @@ -10,28 +10,28 @@ #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H #define _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H -#include <__chrono/time_point.h> -#include 
<__compare/ordering.h> -#include <__config> -#include <__filesystem/file_status.h> -#include <__filesystem/file_time_type.h> -#include <__filesystem/file_type.h> -#include <__filesystem/filesystem_error.h> -#include <__filesystem/operations.h> -#include <__filesystem/path.h> -#include <__filesystem/perms.h> -#include <__system_error/errc.h> -#include <__system_error/error_code.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__filesystem/file_status.h> +#include <__cxx03/__filesystem/file_time_type.h> +#include <__cxx03/__filesystem/file_type.h> +#include <__cxx03/__filesystem/filesystem_error.h> +#include <__cxx03/__filesystem/operations.h> +#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__filesystem/perms.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) diff --git a/libcxx/include/__cxx03/__filesystem/directory_iterator.h b/libcxx/include/__cxx03/__filesystem/directory_iterator.h index e0246d8001e19..667007f5c2cdf 100644 --- a/libcxx/include/__cxx03/__filesystem/directory_iterator.h +++ b/libcxx/include/__cxx03/__filesystem/directory_iterator.h @@ -10,26 +10,26 @@ #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H #define _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H -#include <__assert> -#include <__config> -#include <__filesystem/directory_entry.h> -#include <__filesystem/directory_options.h> -#include <__filesystem/path.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator_traits.h> -#include 
<__memory/shared_ptr.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__system_error/error_code.h> -#include <__utility/move.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__filesystem/directory_entry.h> +#include <__cxx03/__filesystem/directory_options.h> +#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/enable_view.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) diff --git a/libcxx/include/__cxx03/__filesystem/directory_options.h b/libcxx/include/__cxx03/__filesystem/directory_options.h index d0cd3ebfdaa7e..eeb87936f2bd1 100644 --- a/libcxx/include/__cxx03/__filesystem/directory_options.h +++ b/libcxx/include/__cxx03/__filesystem/directory_options.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_OPTIONS_H #define _LIBCPP___FILESYSTEM_DIRECTORY_OPTIONS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/file_status.h b/libcxx/include/__cxx03/__filesystem/file_status.h index da316c8b02746..0022518842af7 100644 --- a/libcxx/include/__cxx03/__filesystem/file_status.h +++ b/libcxx/include/__cxx03/__filesystem/file_status.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___FILESYSTEM_FILE_STATUS_H #define _LIBCPP___FILESYSTEM_FILE_STATUS_H -#include <__config> -#include <__filesystem/file_type.h> -#include <__filesystem/perms.h> +#include <__cxx03/__config> 
+#include <__cxx03/__filesystem/file_type.h> +#include <__cxx03/__filesystem/perms.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/file_time_type.h b/libcxx/include/__cxx03/__filesystem/file_time_type.h index 63e4ae1578cfd..1e964bbe32522 100644 --- a/libcxx/include/__cxx03/__filesystem/file_time_type.h +++ b/libcxx/include/__cxx03/__filesystem/file_time_type.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___FILESYSTEM_FILE_TIME_TYPE_H #define _LIBCPP___FILESYSTEM_FILE_TIME_TYPE_H -#include <__chrono/file_clock.h> -#include <__chrono/time_point.h> -#include <__config> +#include <__cxx03/__chrono/file_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/file_type.h b/libcxx/include/__cxx03/__filesystem/file_type.h index e4ac1dfee9ed9..392ca2b57c26b 100644 --- a/libcxx/include/__cxx03/__filesystem/file_type.h +++ b/libcxx/include/__cxx03/__filesystem/file_type.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FILESYSTEM_FILE_TYPE_H #define _LIBCPP___FILESYSTEM_FILE_TYPE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/filesystem_error.h b/libcxx/include/__cxx03/__filesystem/filesystem_error.h index 80a11e3b1932c..8b68bac2e36fa 100644 --- a/libcxx/include/__cxx03/__filesystem/filesystem_error.h +++ b/libcxx/include/__cxx03/__filesystem/filesystem_error.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H #define _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H -#include <__config> -#include <__filesystem/path.h> -#include <__memory/shared_ptr.h> -#include <__system_error/error_code.h> -#include <__system_error/system_error.h> -#include <__utility/forward.h> -#include <__verbose_abort> -#include +#include <__cxx03/__config> 
+#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/operations.h b/libcxx/include/__cxx03/__filesystem/operations.h index f588189ed1d9d..90b90bbaace93 100644 --- a/libcxx/include/__cxx03/__filesystem/operations.h +++ b/libcxx/include/__cxx03/__filesystem/operations.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___FILESYSTEM_OPERATIONS_H #define _LIBCPP___FILESYSTEM_OPERATIONS_H -#include <__chrono/time_point.h> -#include <__config> -#include <__filesystem/copy_options.h> -#include <__filesystem/file_status.h> -#include <__filesystem/file_time_type.h> -#include <__filesystem/file_type.h> -#include <__filesystem/path.h> -#include <__filesystem/perm_options.h> -#include <__filesystem/perms.h> -#include <__filesystem/space_info.h> -#include <__system_error/error_code.h> -#include +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/__filesystem/copy_options.h> +#include <__cxx03/__filesystem/file_status.h> +#include <__cxx03/__filesystem/file_time_type.h> +#include <__cxx03/__filesystem/file_type.h> +#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__filesystem/perm_options.h> +#include <__cxx03/__filesystem/perms.h> +#include <__cxx03/__filesystem/space_info.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/path.h b/libcxx/include/__cxx03/__filesystem/path.h index ff468d517722f..815d881d8a099 100644 --- a/libcxx/include/__cxx03/__filesystem/path.h +++ b/libcxx/include/__cxx03/__filesystem/path.h @@ -10,24 +10,24 @@ #ifndef 
_LIBCPP___FILESYSTEM_PATH_H #define _LIBCPP___FILESYSTEM_PATH_H -#include <__algorithm/replace.h> -#include <__algorithm/replace_copy.h> -#include <__config> -#include <__functional/unary_function.h> -#include <__fwd/functional.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_pointer.h> -#include -#include -#include +#include <__cxx03/__algorithm/replace.h> +#include <__cxx03/__algorithm/replace_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/cstddef> +#include <__cxx03/string> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include // for quoted -# include +# include <__cxx03/iomanip> // for quoted +# include <__cxx03/locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -35,7 +35,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__cxx03/__filesystem/path_iterator.h b/libcxx/include/__cxx03/__filesystem/path_iterator.h index f4d486d86cf38..c439782a2a6af 100644 --- a/libcxx/include/__cxx03/__filesystem/path_iterator.h +++ b/libcxx/include/__cxx03/__filesystem/path_iterator.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___FILESYSTEM_PATH_ITERATOR_H #define _LIBCPP___FILESYSTEM_PATH_ITERATOR_H -#include <__assert> -#include <__config> -#include <__filesystem/path.h> -#include <__iterator/iterator_traits.h> -#include -#include -#include +#include <__cxx03/__assert> 
+#include <__cxx03/__config> +#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/cstddef> +#include <__cxx03/string> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/perm_options.h b/libcxx/include/__cxx03/__filesystem/perm_options.h index 64c16ee60a17d..bfd7941cf1003 100644 --- a/libcxx/include/__cxx03/__filesystem/perm_options.h +++ b/libcxx/include/__cxx03/__filesystem/perm_options.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FILESYSTEM_PERM_OPTIONS_H #define _LIBCPP___FILESYSTEM_PERM_OPTIONS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/perms.h b/libcxx/include/__cxx03/__filesystem/perms.h index 458f1e6e53483..d14023730938c 100644 --- a/libcxx/include/__cxx03/__filesystem/perms.h +++ b/libcxx/include/__cxx03/__filesystem/perms.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FILESYSTEM_PERMS_H #define _LIBCPP___FILESYSTEM_PERMS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/recursive_directory_iterator.h b/libcxx/include/__cxx03/__filesystem/recursive_directory_iterator.h index caa1396eb301f..43da731cc5b9c 100644 --- a/libcxx/include/__cxx03/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__cxx03/__filesystem/recursive_directory_iterator.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H #define _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H -#include <__config> -#include <__filesystem/directory_entry.h> -#include <__filesystem/directory_options.h> -#include <__filesystem/path.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator_traits.h> -#include <__memory/shared_ptr.h> -#include 
<__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__system_error/error_code.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__filesystem/directory_entry.h> +#include <__cxx03/__filesystem/directory_options.h> +#include <__cxx03/__filesystem/path.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/enable_view.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 && !defined(_LIBCPP_HAS_NO_FILESYSTEM) diff --git a/libcxx/include/__cxx03/__filesystem/space_info.h b/libcxx/include/__cxx03/__filesystem/space_info.h index 3fa57d33096fc..8e2f260249793 100644 --- a/libcxx/include/__cxx03/__filesystem/space_info.h +++ b/libcxx/include/__cxx03/__filesystem/space_info.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FILESYSTEM_SPACE_INFO_H #define _LIBCPP___FILESYSTEM_SPACE_INFO_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__filesystem/u8path.h b/libcxx/include/__cxx03/__filesystem/u8path.h index dae5823128f02..7f1fe89bf2522 100644 --- a/libcxx/include/__cxx03/__filesystem/u8path.h +++ b/libcxx/include/__cxx03/__filesystem/u8path.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___FILESYSTEM_U8PATH_H #define _LIBCPP___FILESYSTEM_U8PATH_H -#include <__algorithm/unwrap_iter.h> -#include <__config> -#include <__filesystem/path.h> -#include +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__config> +#include <__cxx03/__filesystem/path.h> 
+#include <__cxx03/string> // Only required on Windows for __widen_from_utf8, and included conservatively // because it requires support for localization. #if defined(_LIBCPP_WIN32API) -# include +# include <__cxx03/locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__format/buffer.h b/libcxx/include/__cxx03/__format/buffer.h index 8598f0a1c0395..d4162086b84f4 100644 --- a/libcxx/include/__cxx03/__format/buffer.h +++ b/libcxx/include/__cxx03/__format/buffer.h @@ -10,42 +10,42 @@ #ifndef _LIBCPP___FORMAT_BUFFER_H #define _LIBCPP___FORMAT_BUFFER_H -#include <__algorithm/copy_n.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/max.h> -#include <__algorithm/min.h> -#include <__algorithm/ranges_copy_n.h> -#include <__algorithm/transform.h> -#include <__algorithm/unwrap_iter.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/enable_insertable.h> -#include <__format/format_to_n_result.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/wrap_iter.h> -#include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocator_traits.h> -#include <__memory/construct_at.h> -#include <__memory/ranges_construct_at.h> -#include <__memory/uninitialized_algorithms.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/conditional.h> -#include <__utility/exception_guard.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/ranges_copy_n.h> +#include <__cxx03/__algorithm/transform.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include 
<__cxx03/__format/concepts.h> +#include <__cxx03/__format/enable_insertable.h> +#include <__cxx03/__format/format_to_n_result.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocate_at_least.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/ranges_construct_at.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/concepts.h b/libcxx/include/__cxx03/__format/concepts.h index 13380e9b91aff..ffe937e207ffe 100644 --- a/libcxx/include/__cxx03/__format/concepts.h +++ b/libcxx/include/__cxx03/__format/concepts.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___FORMAT_CONCEPTS_H #define _LIBCPP___FORMAT_CONCEPTS_H -#include <__concepts/same_as.h> -#include <__concepts/semiregular.h> -#include <__config> -#include <__format/format_parse_context.h> -#include <__fwd/format.h> -#include <__fwd/tuple.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/is_specialization.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_reference.h> -#include <__utility/pair.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__config> +#include <__cxx03/__format/format_parse_context.h> +#include 
<__cxx03/__fwd/format.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/is_specialization.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/container_adaptor.h b/libcxx/include/__cxx03/__format/container_adaptor.h index 9f49ca03bf4f5..5d47677d8d537 100644 --- a/libcxx/include/__cxx03/__format/container_adaptor.h +++ b/libcxx/include/__cxx03/__format/container_adaptor.h @@ -14,15 +14,15 @@ # pragma GCC system_header #endif -#include <__config> -#include <__format/concepts.h> -#include <__format/formatter.h> -#include <__format/range_default_formatter.h> -#include <__fwd/queue.h> -#include <__fwd/stack.h> -#include <__ranges/ref_view.h> -#include <__type_traits/is_const.h> -#include <__type_traits/maybe_const.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/range_default_formatter.h> +#include <__cxx03/__fwd/queue.h> +#include <__cxx03/__fwd/stack.h> +#include <__cxx03/__ranges/ref_view.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/maybe_const.h> _LIBCPP_BEGIN_NAMESPACE_STD @@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // formatter specializations, not which header should provide them. // Since includes a lot of headers, add these headers here instead of // adding more dependencies like, locale, optinal, string, tuple, etc. to the -// adaptor headers. To use the format functions users already include . +// adaptor headers. To use the format functions users already include <__cxx03/format>. 
template struct _LIBCPP_TEMPLATE_VIS __formatter_container_adaptor { diff --git a/libcxx/include/__cxx03/__format/enable_insertable.h b/libcxx/include/__cxx03/__format/enable_insertable.h index 86ef94a325b19..8bcae600a54ea 100644 --- a/libcxx/include/__cxx03/__format/enable_insertable.h +++ b/libcxx/include/__cxx03/__format/enable_insertable.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FORMAT_ENABLE_INSERTABLE_H #define _LIBCPP___FORMAT_ENABLE_INSERTABLE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/escaped_output_table.h b/libcxx/include/__cxx03/__format/escaped_output_table.h index f7be2dc61f21a..0039968566f88 100644 --- a/libcxx/include/__cxx03/__format/escaped_output_table.h +++ b/libcxx/include/__cxx03/__format/escaped_output_table.h @@ -61,10 +61,10 @@ #ifndef _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H #define _LIBCPP___FORMAT_ESCAPED_OUTPUT_TABLE_H -#include <__algorithm/ranges_upper_bound.h> -#include <__config> -#include -#include +#include <__cxx03/__algorithm/ranges_upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/extended_grapheme_cluster_table.h b/libcxx/include/__cxx03/__format/extended_grapheme_cluster_table.h index 48581d8a5dde3..d83838bbc83a7 100644 --- a/libcxx/include/__cxx03/__format/extended_grapheme_cluster_table.h +++ b/libcxx/include/__cxx03/__format/extended_grapheme_cluster_table.h @@ -61,11 +61,11 @@ #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H -#include <__algorithm/ranges_upper_bound.h> -#include <__config> -#include <__iterator/access.h> -#include -#include +#include <__cxx03/__algorithm/ranges_upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/access.h> 
+#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/format_arg.h b/libcxx/include/__cxx03/__format/format_arg.h index aa02f81dc40e2..3d37555c0e2df 100644 --- a/libcxx/include/__cxx03/__format/format_arg.h +++ b/libcxx/include/__cxx03/__format/format_arg.h @@ -10,29 +10,29 @@ #ifndef _LIBCPP___FORMAT_FORMAT_ARG_H #define _LIBCPP___FORMAT_FORMAT_ARG_H -#include <__assert> -#include <__concepts/arithmetic.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__functional/invoke.h> -#include <__fwd/format.h> -#include <__memory/addressof.h> -#include <__type_traits/conditional.h> -#include <__type_traits/remove_const.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include <__variant/monostate.h> -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__fwd/format.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/__variant/monostate.h> +#include <__cxx03/cstdint> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/format_arg_store.h b/libcxx/include/__cxx03/__format/format_arg_store.h index 23a599e995759..836a5a2ffc456 100644 --- a/libcxx/include/__cxx03/__format/format_arg_store.h 
+++ b/libcxx/include/__cxx03/__format/format_arg_store.h @@ -14,16 +14,16 @@ # pragma GCC system_header #endif -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_arg.h> -#include <__type_traits/conditional.h> -#include <__type_traits/extent.h> -#include <__type_traits/remove_const.h> -#include -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_arg.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/extent.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/string> +#include <__cxx03/string_view> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/format_args.h b/libcxx/include/__cxx03/__format/format_args.h index 07923570f3893..494ba91cf55ae 100644 --- a/libcxx/include/__cxx03/__format/format_args.h +++ b/libcxx/include/__cxx03/__format/format_args.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___FORMAT_FORMAT_ARGS_H #define _LIBCPP___FORMAT_FORMAT_ARGS_H -#include <__config> -#include <__format/format_arg.h> -#include <__format/format_arg_store.h> -#include <__fwd/format.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__format/format_arg.h> +#include <__cxx03/__format/format_arg_store.h> +#include <__cxx03/__fwd/format.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/format_context.h b/libcxx/include/__cxx03/__format/format_context.h index 20c07559eae44..04f2d69a247db 100644 --- a/libcxx/include/__cxx03/__format/format_context.h +++ b/libcxx/include/__cxx03/__format/format_context.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___FORMAT_FORMAT_CONTEXT_H #define _LIBCPP___FORMAT_FORMAT_CONTEXT_H -#include 
<__concepts/same_as.h> -#include <__config> -#include <__format/buffer.h> -#include <__format/format_arg.h> -#include <__format/format_arg_store.h> -#include <__format/format_args.h> -#include <__format/format_error.h> -#include <__fwd/format.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/concepts.h> -#include <__memory/addressof.h> -#include <__utility/move.h> -#include <__variant/monostate.h> -#include +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/buffer.h> +#include <__cxx03/__format/format_arg.h> +#include <__cxx03/__format/format_arg_store.h> +#include <__cxx03/__format/format_args.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__fwd/format.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__variant/monostate.h> +#include <__cxx03/cstddef> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include <__locale> -# include +# include <__cxx03/__locale> +# include <__cxx03/optional> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -35,7 +35,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/format_error.h b/libcxx/include/__cxx03/__format/format_error.h index ed40e395d6af7..701bf3ad0dcc5 100644 --- a/libcxx/include/__cxx03/__format/format_error.h +++ b/libcxx/include/__cxx03/__format/format_error.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___FORMAT_FORMAT_ERROR_H #define _LIBCPP___FORMAT_FORMAT_ERROR_H -#include <__config> -#include <__verbose_abort> -#include +#include <__cxx03/__config> +#include <__cxx03/__verbose_abort> +#include <__cxx03/stdexcept> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/format_functions.h 
b/libcxx/include/__cxx03/__format/format_functions.h index d14b49aff1495..bc9056dd0bdeb 100644 --- a/libcxx/include/__cxx03/__format/format_functions.h +++ b/libcxx/include/__cxx03/__format/format_functions.h @@ -10,38 +10,38 @@ #ifndef _LIBCPP___FORMAT_FORMAT_FUNCTIONS #define _LIBCPP___FORMAT_FORMAT_FUNCTIONS -#include <__algorithm/clamp.h> -#include <__concepts/convertible_to.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/buffer.h> -#include <__format/format_arg.h> -#include <__format/format_arg_store.h> -#include <__format/format_args.h> -#include <__format/format_context.h> -#include <__format/format_error.h> -#include <__format/format_parse_context.h> -#include <__format/format_string.h> -#include <__format/format_to_n_result.h> -#include <__format/formatter.h> -#include <__format/formatter_bool.h> -#include <__format/formatter_char.h> -#include <__format/formatter_floating_point.h> -#include <__format/formatter_integer.h> -#include <__format/formatter_pointer.h> -#include <__format/formatter_string.h> -#include <__format/parser_std_format_spec.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> // iter_value_t -#include <__variant/monostate.h> -#include -#include -#include +#include <__cxx03/__algorithm/clamp.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/buffer.h> +#include <__cxx03/__format/format_arg.h> +#include <__cxx03/__format/format_arg_store.h> +#include <__cxx03/__format/format_args.h> +#include <__cxx03/__format/format_context.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/format_string.h> +#include <__cxx03/__format/format_to_n_result.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_bool.h> 
+#include <__cxx03/__format/formatter_char.h> +#include <__cxx03/__format/formatter_floating_point.h> +#include <__cxx03/__format/formatter_integer.h> +#include <__cxx03/__format/formatter_pointer.h> +#include <__cxx03/__format/formatter_string.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> // iter_value_t +#include <__cxx03/__variant/monostate.h> +#include <__cxx03/array> +#include <__cxx03/string> +#include <__cxx03/string_view> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include <__locale> +# include <__cxx03/__locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -49,7 +49,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/format_parse_context.h b/libcxx/include/__cxx03/__format/format_parse_context.h index aefcd5497f3b9..1df545987659b 100644 --- a/libcxx/include/__cxx03/__format/format_parse_context.h +++ b/libcxx/include/__cxx03/__format/format_parse_context.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___FORMAT_FORMAT_PARSE_CONTEXT_H #define _LIBCPP___FORMAT_FORMAT_PARSE_CONTEXT_H -#include <__config> -#include <__format/format_error.h> -#include <__type_traits/is_constant_evaluated.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/format_string.h b/libcxx/include/__cxx03/__format/format_string.h index bdf3cff7f49b1..c48719337372d 100644 --- a/libcxx/include/__cxx03/__format/format_string.h +++ b/libcxx/include/__cxx03/__format/format_string.h @@ -10,13 +10,13 @@ #ifndef 
_LIBCPP___FORMAT_FORMAT_STRING_H #define _LIBCPP___FORMAT_FORMAT_STRING_H -#include <__assert> -#include <__config> -#include <__format/format_error.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> // iter_value_t -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> // iter_value_t +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/format_to_n_result.h b/libcxx/include/__cxx03/__format/format_to_n_result.h index 6f30546dec081..8797432922eb9 100644 --- a/libcxx/include/__cxx03/__format/format_to_n_result.h +++ b/libcxx/include/__cxx03/__format/format_to_n_result.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FORMAT_FORMAT_TO_N_RESULT_H #define _LIBCPP___FORMAT_FORMAT_TO_N_RESULT_H -#include <__config> -#include <__iterator/incrementable_traits.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/incrementable_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter.h b/libcxx/include/__cxx03/__format/formatter.h index e2f418f936ee1..98457d0a2c65e 100644 --- a/libcxx/include/__cxx03/__format/formatter.h +++ b/libcxx/include/__cxx03/__format/formatter.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_H #define _LIBCPP___FORMAT_FORMATTER_H -#include <__config> -#include <__fwd/format.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/format.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter_bool.h b/libcxx/include/__cxx03/__format/formatter_bool.h index 17dc69541e8fe..dd8e97bfc7f48 100644 --- a/libcxx/include/__cxx03/__format/formatter_bool.h +++ 
b/libcxx/include/__cxx03/__format/formatter_bool.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_BOOL_H #define _LIBCPP___FORMAT_FORMATTER_BOOL_H -#include <__algorithm/copy.h> -#include <__assert> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/parser_std_format_spec.h> -#include <__utility/unreachable.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__utility/unreachable.h> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include <__locale> +# include <__cxx03/__locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__format/formatter_char.h b/libcxx/include/__cxx03/__format/formatter_char.h index d33e84368a765..b743433c2891d 100644 --- a/libcxx/include/__cxx03/__format/formatter_char.h +++ b/libcxx/include/__cxx03/__format/formatter_char.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_CHAR_H #define _LIBCPP___FORMAT_FORMATTER_CHAR_H -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__format/write_escaped.h> -#include <__type_traits/conditional.h> -#include <__type_traits/make_unsigned.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include 
<__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__format/write_escaped.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/make_unsigned.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter_floating_point.h b/libcxx/include/__cxx03/__format/formatter_floating_point.h index fa42ba203b0b5..36663fb90716b 100644 --- a/libcxx/include/__cxx03/__format/formatter_floating_point.h +++ b/libcxx/include/__cxx03/__format/formatter_floating_point.h @@ -10,36 +10,36 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_FLOATING_POINT_H #define _LIBCPP___FORMAT_FORMATTER_FLOATING_POINT_H -#include <__algorithm/copy_n.h> -#include <__algorithm/find.h> -#include <__algorithm/max.h> -#include <__algorithm/min.h> -#include <__algorithm/rotate.h> -#include <__algorithm/transform.h> -#include <__assert> -#include <__charconv/chars_format.h> -#include <__charconv/to_chars_floating_point.h> -#include <__charconv/to_chars_result.h> -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__iterator/concepts.h> -#include <__memory/allocator.h> -#include <__system_error/errc.h> -#include <__type_traits/conditional.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/rotate.h> +#include <__cxx03/__algorithm/transform.h> +#include <__cxx03/__assert> +#include 
<__cxx03/__charconv/chars_format.h> +#include <__cxx03/__charconv/to_chars_floating_point.h> +#include <__cxx03/__charconv/to_chars_result.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/cmath> +#include <__cxx03/cstddef> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include <__locale> +# include <__cxx03/__locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -47,7 +47,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/formatter_integer.h b/libcxx/include/__cxx03/__format/formatter_integer.h index 41400f00478eb..0d4dd15de4f9d 100644 --- a/libcxx/include/__cxx03/__format/formatter_integer.h +++ b/libcxx/include/__cxx03/__format/formatter_integer.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_INTEGER_H #define _LIBCPP___FORMAT_FORMATTER_INTEGER_H -#include <__concepts/arithmetic.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__type_traits/is_void.h> -#include <__type_traits/make_32_64_or_128_bit.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include 
<__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/make_32_64_or_128_bit.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter_integral.h b/libcxx/include/__cxx03/__format/formatter_integral.h index eca966f8886f8..6bab831244498 100644 --- a/libcxx/include/__cxx03/__format/formatter_integral.h +++ b/libcxx/include/__cxx03/__format/formatter_integral.h @@ -10,29 +10,29 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_INTEGRAL_H #define _LIBCPP___FORMAT_FORMATTER_INTEGRAL_H -#include <__charconv/to_chars_integral.h> -#include <__charconv/to_chars_result.h> -#include <__charconv/traits.h> -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_error.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__system_error/errc.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/unreachable.h> -#include -#include -#include -#include +#include <__cxx03/__charconv/to_chars_integral.h> +#include <__cxx03/__charconv/to_chars_result.h> +#include <__cxx03/__charconv/traits.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__iterator/concepts.h> +#include 
<__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/array> +#include <__cxx03/limits> +#include <__cxx03/string> +#include <__cxx03/string_view> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include <__locale> +# include <__cxx03/__locale> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -40,7 +40,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/formatter_output.h b/libcxx/include/__cxx03/__format/formatter_output.h index 1498f64c4aeff..edf6909de750a 100644 --- a/libcxx/include/__cxx03/__format/formatter_output.h +++ b/libcxx/include/__cxx03/__format/formatter_output.h @@ -10,33 +10,33 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_OUTPUT_H #define _LIBCPP___FORMAT_FORMATTER_OUTPUT_H -#include <__algorithm/ranges_copy.h> -#include <__algorithm/ranges_fill_n.h> -#include <__algorithm/ranges_transform.h> -#include <__bit/countl.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/buffer.h> -#include <__format/concepts.h> -#include <__format/formatter.h> -#include <__format/parser_std_format_spec.h> -#include <__format/unicode.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__memory/pointer_traits.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__algorithm/ranges_fill_n.h> +#include <__cxx03/__algorithm/ranges_transform.h> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/buffer.h> +#include <__cxx03/__format/concepts.h> +#include 
<__cxx03/__format/formatter.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__format/unicode.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/cstddef> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/formatter_pointer.h b/libcxx/include/__cxx03/__format/formatter_pointer.h index 6941343efd91f..166a2e93b622f 100644 --- a/libcxx/include/__cxx03/__format/formatter_pointer.h +++ b/libcxx/include/__cxx03/__format/formatter_pointer.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_POINTER_H #define _LIBCPP___FORMAT_FORMATTER_POINTER_H -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter_string.h b/libcxx/include/__cxx03/__format/formatter_string.h index 347439fc8dff1..9cc862a4545c7 100644 --- 
a/libcxx/include/__cxx03/__format/formatter_string.h +++ b/libcxx/include/__cxx03/__format/formatter_string.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_STRING_H #define _LIBCPP___FORMAT_FORMATTER_STRING_H -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__format/write_escaped.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__format/write_escaped.h> +#include <__cxx03/string> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/formatter_tuple.h b/libcxx/include/__cxx03/__format/formatter_tuple.h index 030097a8797da..911552f663505 100644 --- a/libcxx/include/__cxx03/__format/formatter_tuple.h +++ b/libcxx/include/__cxx03/__format/formatter_tuple.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___FORMAT_FORMATTER_TUPLE_H #define _LIBCPP___FORMAT_FORMATTER_TUPLE_H -#include <__algorithm/ranges_copy.h> -#include <__chrono/statically_widen.h> -#include <__config> -#include <__format/buffer.h> -#include <__format/concepts.h> -#include <__format/format_context.h> -#include <__format/format_error.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/integer_sequence.h> -#include <__utility/pair.h> -#include -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__config> +#include 
<__cxx03/__format/buffer.h> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_context.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/string_view> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/indic_conjunct_break_table.h b/libcxx/include/__cxx03/__format/indic_conjunct_break_table.h index 44521d27498c3..b02c2c324573c 100644 --- a/libcxx/include/__cxx03/__format/indic_conjunct_break_table.h +++ b/libcxx/include/__cxx03/__format/indic_conjunct_break_table.h @@ -61,11 +61,11 @@ #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H -#include <__algorithm/ranges_upper_bound.h> -#include <__config> -#include <__iterator/access.h> -#include -#include +#include <__cxx03/__algorithm/ranges_upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/parser_std_format_spec.h b/libcxx/include/__cxx03/__format/parser_std_format_spec.h index 150bdde89f3b3..1afb5b2c5dd14 100644 --- a/libcxx/include/__cxx03/__format/parser_std_format_spec.h +++ b/libcxx/include/__cxx03/__format/parser_std_format_spec.h @@ -16,35 +16,35 @@ /// This header has some support for the chrono-format-spec since it doesn't /// affect the std-format-spec. 
-#include <__algorithm/copy_n.h> -#include <__algorithm/min.h> -#include <__assert> -#include <__concepts/arithmetic.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/format_arg.h> -#include <__format/format_error.h> -#include <__format/format_parse_context.h> -#include <__format/format_string.h> -#include <__format/unicode.h> -#include <__format/width_estimation_table.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> // iter_value_t -#include <__memory/addressof.h> -#include <__type_traits/common_type.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__variant/monostate.h> -#include -#include -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/format_arg.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/format_string.h> +#include <__cxx03/__format/unicode.h> +#include <__cxx03/__format/width_estimation_table.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> // iter_value_t +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__variant/monostate.h> +#include <__cxx03/cstdint> +#include <__cxx03/string> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/range_default_formatter.h b/libcxx/include/__cxx03/__format/range_default_formatter.h index 
b35223ae93329..c0e07870430e8 100644 --- a/libcxx/include/__cxx03/__format/range_default_formatter.h +++ b/libcxx/include/__cxx03/__format/range_default_formatter.h @@ -14,22 +14,22 @@ # pragma GCC system_header #endif -#include <__algorithm/ranges_copy.h> -#include <__chrono/statically_widen.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/formatter.h> -#include <__format/range_formatter.h> -#include <__iterator/back_insert_iterator.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/from_range.h> -#include <__ranges/size.h> -#include <__type_traits/conditional.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/range_formatter.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/string_view> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/range_formatter.h b/libcxx/include/__cxx03/__format/range_formatter.h index 6915630743493..57bb5ff726ba6 100644 --- a/libcxx/include/__cxx03/__format/range_formatter.h +++ b/libcxx/include/__cxx03/__format/range_formatter.h @@ -14,24 +14,24 @@ # pragma GCC system_header #endif -#include <__algorithm/ranges_copy.h> -#include <__chrono/statically_widen.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/buffer.h> -#include <__format/concepts.h> -#include <__format/format_context.h> 
-#include <__format/format_error.h> -#include <__format/formatter.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__iterator/back_insert_iterator.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/from_range.h> -#include <__ranges/size.h> -#include <__type_traits/remove_cvref.h> -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__format/buffer.h> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_context.h> +#include <__cxx03/__format/format_error.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/string_view> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__format/unicode.h b/libcxx/include/__cxx03/__format/unicode.h index de7d0fea1df56..5cf1a8967ab97 100644 --- a/libcxx/include/__cxx03/__format/unicode.h +++ b/libcxx/include/__cxx03/__format/unicode.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___FORMAT_UNICODE_H #define _LIBCPP___FORMAT_UNICODE_H -#include <__assert> -#include <__bit/countl.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__format/extended_grapheme_cluster_table.h> -#include <__format/indic_conjunct_break_table.h> -#include <__iterator/concepts.h> -#include <__iterator/readable_traits.h> // iter_value_t -#include <__utility/unreachable.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include 
<__cxx03/__format/extended_grapheme_cluster_table.h> +#include <__cxx03/__format/indic_conjunct_break_table.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/readable_traits.h> // iter_value_t +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/width_estimation_table.h b/libcxx/include/__cxx03/__format/width_estimation_table.h index 11f61dea18d69..2f0367d1c8f98 100644 --- a/libcxx/include/__cxx03/__format/width_estimation_table.h +++ b/libcxx/include/__cxx03/__format/width_estimation_table.h @@ -61,10 +61,10 @@ #ifndef _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H #define _LIBCPP___FORMAT_WIDTH_ESTIMATION_TABLE_H -#include <__algorithm/ranges_upper_bound.h> -#include <__config> -#include -#include +#include <__cxx03/__algorithm/ranges_upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__format/write_escaped.h b/libcxx/include/__cxx03/__format/write_escaped.h index 052ea98c3c3b8..82e65ca988ebf 100644 --- a/libcxx/include/__cxx03/__format/write_escaped.h +++ b/libcxx/include/__cxx03/__format/write_escaped.h @@ -10,28 +10,28 @@ #ifndef _LIBCPP___FORMAT_WRITE_ESCAPED_H #define _LIBCPP___FORMAT_WRITE_ESCAPED_H -#include <__algorithm/ranges_copy.h> -#include <__algorithm/ranges_for_each.h> -#include <__charconv/to_chars_integral.h> -#include <__charconv/to_chars_result.h> -#include <__chrono/statically_widen.h> -#include <__format/escaped_output_table.h> -#include <__format/formatter_output.h> -#include <__format/parser_std_format_spec.h> -#include <__format/unicode.h> -#include <__iterator/back_insert_iterator.h> -#include <__memory/addressof.h> -#include <__system_error/errc.h> -#include <__type_traits/make_unsigned.h> -#include 
<__utility/move.h> -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__algorithm/ranges_for_each.h> +#include <__cxx03/__charconv/to_chars_integral.h> +#include <__cxx03/__charconv/to_chars_result.h> +#include <__cxx03/__chrono/statically_widen.h> +#include <__cxx03/__format/escaped_output_table.h> +#include <__cxx03/__format/formatter_output.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__format/unicode.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__functional/binary_function.h b/libcxx/include/__cxx03/__functional/binary_function.h index ddee3b170311f..0e54bc5e3c611 100644 --- a/libcxx/include/__cxx03/__functional/binary_function.h +++ b/libcxx/include/__cxx03/__functional/binary_function.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___FUNCTIONAL_BINARY_FUNCTION_H #define _LIBCPP___FUNCTIONAL_BINARY_FUNCTION_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/binary_negate.h b/libcxx/include/__cxx03/__functional/binary_negate.h index ce52b5ae9fc49..60dfb19918656 100644 --- a/libcxx/include/__cxx03/__functional/binary_negate.h +++ b/libcxx/include/__cxx03/__functional/binary_negate.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_BINARY_NEGATE_H #define _LIBCPP___FUNCTIONAL_BINARY_NEGATE_H -#include <__config> -#include <__functional/binary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/bind.h b/libcxx/include/__cxx03/__functional/bind.h index b4f46441da507..5a1a76a6f1f7a 100644 --- a/libcxx/include/__cxx03/__functional/bind.h +++ b/libcxx/include/__cxx03/__functional/bind.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___FUNCTIONAL_BIND_H #define _LIBCPP___FUNCTIONAL_BIND_H -#include <__config> -#include <__functional/invoke.h> -#include <__functional/weak_result_type.h> -#include <__fwd/functional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_reference_wrapper.h> -#include <__type_traits/is_void.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/weak_result_type.h> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_reference_wrapper.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/cstddef> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/bind_back.h b/libcxx/include/__cxx03/__functional/bind_back.h index e44768d2283c0..fef99f1852377 100644 --- a/libcxx/include/__cxx03/__functional/bind_back.h +++ b/libcxx/include/__cxx03/__functional/bind_back.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___FUNCTIONAL_BIND_BACK_H #define _LIBCPP___FUNCTIONAL_BIND_BACK_H -#include <__config> -#include <__functional/invoke.h> -#include <__functional/perfect_forward.h> -#include <__type_traits/decay.h> -#include <__utility/forward.h> -#include <__utility/integer_sequence.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/perfect_forward.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/tuple> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/bind_front.h b/libcxx/include/__cxx03/__functional/bind_front.h index 87ef3affe80b6..1156f53be096e 100644 --- a/libcxx/include/__cxx03/__functional/bind_front.h +++ b/libcxx/include/__cxx03/__functional/bind_front.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___FUNCTIONAL_BIND_FRONT_H #define _LIBCPP___FUNCTIONAL_BIND_FRONT_H -#include <__config> -#include <__functional/invoke.h> -#include <__functional/perfect_forward.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/decay.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constructible.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/perfect_forward.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/binder1st.h b/libcxx/include/__cxx03/__functional/binder1st.h index 04b51fefab70a..1f2f7ebb9c853 100644 --- a/libcxx/include/__cxx03/__functional/binder1st.h +++ b/libcxx/include/__cxx03/__functional/binder1st.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_BINDER1ST_H #define _LIBCPP___FUNCTIONAL_BINDER1ST_H -#include <__config> -#include <__functional/unary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/binder2nd.h b/libcxx/include/__cxx03/__functional/binder2nd.h index 9d22e4430b1b3..d70fab8c93b4a 100644 --- a/libcxx/include/__cxx03/__functional/binder2nd.h +++ b/libcxx/include/__cxx03/__functional/binder2nd.h 
@@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_BINDER2ND_H #define _LIBCPP___FUNCTIONAL_BINDER2ND_H -#include <__config> -#include <__functional/unary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/boyer_moore_searcher.h b/libcxx/include/__cxx03/__functional/boyer_moore_searcher.h index 648b60c505219..e7724ce6f44c0 100644 --- a/libcxx/include/__cxx03/__functional/boyer_moore_searcher.h +++ b/libcxx/include/__cxx03/__functional/boyer_moore_searcher.h @@ -13,23 +13,23 @@ # pragma GCC system_header #endif -#include <__algorithm/fill_n.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/operations.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__memory/shared_ptr.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/pair.h> -#include -#include -#include +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/array> +#include <__cxx03/unordered_map> +#include <__cxx03/vector> #if _LIBCPP_STD_VER >= 17 _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__functional/compose.h b/libcxx/include/__cxx03/__functional/compose.h index 4b86dd37cd48a..504a7e1ca1426 100644 --- a/libcxx/include/__cxx03/__functional/compose.h +++ b/libcxx/include/__cxx03/__functional/compose.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___FUNCTIONAL_COMPOSE_H #define _LIBCPP___FUNCTIONAL_COMPOSE_H -#include <__config> -#include 
<__functional/invoke.h> -#include <__functional/perfect_forward.h> -#include <__type_traits/decay.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/perfect_forward.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/default_searcher.h b/libcxx/include/__cxx03/__functional/default_searcher.h index db89d10757c1b..90ee48fc5e93b 100644 --- a/libcxx/include/__cxx03/__functional/default_searcher.h +++ b/libcxx/include/__cxx03/__functional/default_searcher.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___FUNCTIONAL_DEFAULT_SEARCHER_H #define _LIBCPP___FUNCTIONAL_DEFAULT_SEARCHER_H -#include <__algorithm/search.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/operations.h> -#include <__iterator/iterator_traits.h> -#include <__utility/pair.h> +#include <__cxx03/__algorithm/search.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/function.h b/libcxx/include/__cxx03/__functional/function.h index c7b98035e34bf..fe3cc8accac63 100644 --- a/libcxx/include/__cxx03/__functional/function.h +++ b/libcxx/include/__cxx03/__functional/function.h @@ -10,43 +10,43 @@ #ifndef _LIBCPP___FUNCTIONAL_FUNCTION_H #define _LIBCPP___FUNCTIONAL_FUNCTION_H -#include <__assert> -#include <__config> -#include <__exception/exception.h> -#include <__functional/binary_function.h> -#include <__functional/invoke.h> -#include <__functional/unary_function.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include 
<__memory/allocator.h> -#include <__memory/allocator_destructor.h> -#include <__memory/allocator_traits.h> -#include <__memory/builtin_new_allocator.h> -#include <__memory/compressed_pair.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/aligned_storage.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_core_convertible.h> -#include <__type_traits/is_scalar.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_void.h> -#include <__type_traits/strip_signature.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/piecewise_construct.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_destructor.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/builtin_new_allocator.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/aligned_storage.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_core_convertible.h> +#include <__cxx03/__type_traits/is_scalar.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/strip_signature.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/__utility/swap.h> +#include 
<__cxx03/__verbose_abort> +#include <__cxx03/new> +#include <__cxx03/tuple> +#include <__cxx03/typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #ifndef _LIBCPP_CXX03_LANG diff --git a/libcxx/include/__cxx03/__functional/hash.h b/libcxx/include/__cxx03/__functional/hash.h index a9e450edd39f5..216d2ae4d1bc5 100644 --- a/libcxx/include/__cxx03/__functional/hash.h +++ b/libcxx/include/__cxx03/__functional/hash.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___FUNCTIONAL_HASH_H #define _LIBCPP___FUNCTIONAL_HASH_H -#include <__config> -#include <__functional/unary_function.h> -#include <__fwd/functional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/invoke.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/underlying_type.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/underlying_type.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/cstring> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/identity.h b/libcxx/include/__cxx03/__functional/identity.h index 8468de3dae26c..7d016af84f854 100644 --- a/libcxx/include/__cxx03/__functional/identity.h +++ b/libcxx/include/__cxx03/__functional/identity.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___FUNCTIONAL_IDENTITY_H #define _LIBCPP___FUNCTIONAL_IDENTITY_H -#include <__config> -#include 
<__fwd/functional.h> -#include <__type_traits/integral_constant.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/invoke.h b/libcxx/include/__cxx03/__functional/invoke.h index ef4bf25f07759..a9c039bc0379b 100644 --- a/libcxx/include/__cxx03/__functional/invoke.h +++ b/libcxx/include/__cxx03/__functional/invoke.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___FUNCTIONAL_INVOKE_H #define _LIBCPP___FUNCTIONAL_INVOKE_H -#include <__config> -#include <__type_traits/invoke.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/is_transparent.h b/libcxx/include/__cxx03/__functional/is_transparent.h index b2d62f2e3ead8..de978871404b8 100644 --- a/libcxx/include/__cxx03/__functional/is_transparent.h +++ b/libcxx/include/__cxx03/__functional/is_transparent.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_IS_TRANSPARENT #define _LIBCPP___FUNCTIONAL_IS_TRANSPARENT -#include <__config> -#include <__type_traits/void_t.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/void_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/mem_fn.h b/libcxx/include/__cxx03/__functional/mem_fn.h index ee07a71774f9a..31bf5a86bffdc 100644 --- a/libcxx/include/__cxx03/__functional/mem_fn.h +++ b/libcxx/include/__cxx03/__functional/mem_fn.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___FUNCTIONAL_MEM_FN_H #define _LIBCPP___FUNCTIONAL_MEM_FN_H -#include <__config> -#include <__functional/binary_function.h> -#include <__functional/invoke.h> 
-#include <__functional/weak_result_type.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/weak_result_type.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/mem_fun_ref.h b/libcxx/include/__cxx03/__functional/mem_fun_ref.h index c344420b0299e..e0e2f29b95d16 100644 --- a/libcxx/include/__cxx03/__functional/mem_fun_ref.h +++ b/libcxx/include/__cxx03/__functional/mem_fun_ref.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___FUNCTIONAL_MEM_FUN_REF_H #define _LIBCPP___FUNCTIONAL_MEM_FUN_REF_H -#include <__config> -#include <__functional/binary_function.h> -#include <__functional/unary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/unary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/not_fn.h b/libcxx/include/__cxx03/__functional/not_fn.h index 4b3ce5524a743..8b9264af64c58 100644 --- a/libcxx/include/__cxx03/__functional/not_fn.h +++ b/libcxx/include/__cxx03/__functional/not_fn.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___FUNCTIONAL_NOT_FN_H #define _LIBCPP___FUNCTIONAL_NOT_FN_H -#include <__config> -#include <__functional/invoke.h> -#include <__functional/perfect_forward.h> -#include <__type_traits/decay.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constructible.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/perfect_forward.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # 
pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/operations.h b/libcxx/include/__cxx03/__functional/operations.h index 0a6320f19de3f..c05579e35e835 100644 --- a/libcxx/include/__cxx03/__functional/operations.h +++ b/libcxx/include/__cxx03/__functional/operations.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___FUNCTIONAL_OPERATIONS_H #define _LIBCPP___FUNCTIONAL_OPERATIONS_H -#include <__config> -#include <__functional/binary_function.h> -#include <__functional/unary_function.h> -#include <__type_traits/desugars_to.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/perfect_forward.h b/libcxx/include/__cxx03/__functional/perfect_forward.h index 74177c789b4ad..45ecb68250dc9 100644 --- a/libcxx/include/__cxx03/__functional/perfect_forward.h +++ b/libcxx/include/__cxx03/__functional/perfect_forward.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___FUNCTIONAL_PERFECT_FORWARD_H #define _LIBCPP___FUNCTIONAL_PERFECT_FORWARD_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/invoke.h> -#include <__type_traits/is_constructible.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/integer_sequence.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/invoke.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif 
_LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h b/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h index e345250dcdd87..d033f9050f0c3 100644 --- a/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h +++ b/libcxx/include/__cxx03/__functional/pointer_to_binary_function.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_POINTER_TO_BINARY_FUNCTION_H #define _LIBCPP___FUNCTIONAL_POINTER_TO_BINARY_FUNCTION_H -#include <__config> -#include <__functional/binary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h b/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h index 3a5d153d36178..9e7f62915c91f 100644 --- a/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h +++ b/libcxx/include/__cxx03/__functional/pointer_to_unary_function.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_POINTER_TO_UNARY_FUNCTION_H #define _LIBCPP___FUNCTIONAL_POINTER_TO_UNARY_FUNCTION_H -#include <__config> -#include <__functional/unary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/ranges_operations.h b/libcxx/include/__cxx03/__functional/ranges_operations.h index 27f06eadd0eb1..e8a2b367c9350 100644 --- a/libcxx/include/__cxx03/__functional/ranges_operations.h +++ b/libcxx/include/__cxx03/__functional/ranges_operations.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___FUNCTIONAL_RANGES_OPERATIONS_H #define _LIBCPP___FUNCTIONAL_RANGES_OPERATIONS_H -#include <__concepts/equality_comparable.h> -#include <__concepts/totally_ordered.h> -#include <__config> -#include 
<__type_traits/desugars_to.h> -#include <__utility/forward.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/totally_ordered.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/reference_wrapper.h b/libcxx/include/__cxx03/__functional/reference_wrapper.h index 3570e2673c800..73788e1fdb89b 100644 --- a/libcxx/include/__cxx03/__functional/reference_wrapper.h +++ b/libcxx/include/__cxx03/__functional/reference_wrapper.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H #define _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H -#include <__compare/synth_three_way.h> -#include <__concepts/boolean_testable.h> -#include <__config> -#include <__functional/invoke.h> -#include <__functional/weak_result_type.h> -#include <__memory/addressof.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_const.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__compare/synth_three_way.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/weak_result_type.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/unary_function.h b/libcxx/include/__cxx03/__functional/unary_function.h index 69b1bc94220ae..9b1e336f15fa4 100644 --- 
a/libcxx/include/__cxx03/__functional/unary_function.h +++ b/libcxx/include/__cxx03/__functional/unary_function.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___FUNCTIONAL_UNARY_FUNCTION_H #define _LIBCPP___FUNCTIONAL_UNARY_FUNCTION_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/unary_negate.h b/libcxx/include/__cxx03/__functional/unary_negate.h index 5bd487a97bcb3..7154d234b007f 100644 --- a/libcxx/include/__cxx03/__functional/unary_negate.h +++ b/libcxx/include/__cxx03/__functional/unary_negate.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FUNCTIONAL_UNARY_NEGATE_H #define _LIBCPP___FUNCTIONAL_UNARY_NEGATE_H -#include <__config> -#include <__functional/unary_function.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__functional/weak_result_type.h b/libcxx/include/__cxx03/__functional/weak_result_type.h index ad7a8395186cd..e1d62d634ca33 100644 --- a/libcxx/include/__cxx03/__functional/weak_result_type.h +++ b/libcxx/include/__cxx03/__functional/weak_result_type.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___FUNCTIONAL_WEAK_RESULT_TYPE_H #define _LIBCPP___FUNCTIONAL_WEAK_RESULT_TYPE_H -#include <__config> -#include <__functional/binary_function.h> -#include <__functional/invoke.h> -#include <__functional/unary_function.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff 
--git a/libcxx/include/__cxx03/__fwd/array.h b/libcxx/include/__cxx03/__fwd/array.h index b429d0c5a9542..36343371a2278 100644 --- a/libcxx/include/__cxx03/__fwd/array.h +++ b/libcxx/include/__cxx03/__fwd/array.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_ARRAY_H #define _LIBCPP___FWD_ARRAY_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/bit_reference.h b/libcxx/include/__cxx03/__fwd/bit_reference.h index 237efb6db6642..6bda18429c392 100644 --- a/libcxx/include/__cxx03/__fwd/bit_reference.h +++ b/libcxx/include/__cxx03/__fwd/bit_reference.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___FWD_BIT_REFERENCE_H #define _LIBCPP___FWD_BIT_REFERENCE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/complex.h b/libcxx/include/__cxx03/__fwd/complex.h index 22c78c5cc3c77..e734f2db76fed 100644 --- a/libcxx/include/__cxx03/__fwd/complex.h +++ b/libcxx/include/__cxx03/__fwd/complex.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_COMPLEX_H #define _LIBCPP___FWD_COMPLEX_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/deque.h b/libcxx/include/__cxx03/__fwd/deque.h index fd2fb5bb4b8e9..31aa8c3799823 100644 --- a/libcxx/include/__cxx03/__fwd/deque.h +++ b/libcxx/include/__cxx03/__fwd/deque.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_DEQUE_H #define _LIBCPP___FWD_DEQUE_H -#include <__config> -#include <__fwd/memory.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/memory.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/format.h b/libcxx/include/__cxx03/__fwd/format.h index b30c220f8a043..2777433363270 
100644 --- a/libcxx/include/__cxx03/__fwd/format.h +++ b/libcxx/include/__cxx03/__fwd/format.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FWD_FORMAT_H #define _LIBCPP___FWD_FORMAT_H -#include <__config> -#include <__iterator/concepts.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/fstream.h b/libcxx/include/__cxx03/__fwd/fstream.h index b4a112bfd4de6..717cb8c583d2c 100644 --- a/libcxx/include/__cxx03/__fwd/fstream.h +++ b/libcxx/include/__cxx03/__fwd/fstream.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_FSTREAM_H #define _LIBCPP___FWD_FSTREAM_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/functional.h b/libcxx/include/__cxx03/__fwd/functional.h index 32c9ef33e453b..e19b802d178ca 100644 --- a/libcxx/include/__cxx03/__fwd/functional.h +++ b/libcxx/include/__cxx03/__fwd/functional.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___FWD_FUNCTIONAL_H #define _LIBCPP___FWD_FUNCTIONAL_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/ios.h b/libcxx/include/__cxx03/__fwd/ios.h index 48350709d4ce2..f5ba74c9067bb 100644 --- a/libcxx/include/__cxx03/__fwd/ios.h +++ b/libcxx/include/__cxx03/__fwd/ios.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_IOS_H #define _LIBCPP___FWD_IOS_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/istream.h b/libcxx/include/__cxx03/__fwd/istream.h index a06907a6c8ef9..8975ac26baf6b 100644 --- a/libcxx/include/__cxx03/__fwd/istream.h +++ 
b/libcxx/include/__cxx03/__fwd/istream.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_ISTREAM_H #define _LIBCPP___FWD_ISTREAM_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/mdspan.h b/libcxx/include/__cxx03/__fwd/mdspan.h index 8889567a047f6..369da43a20296 100644 --- a/libcxx/include/__cxx03/__fwd/mdspan.h +++ b/libcxx/include/__cxx03/__fwd/mdspan.h @@ -17,14 +17,14 @@ #ifndef _LIBCPP___MDSPAN_LAYOUTS_H #define _LIBCPP___MDSPAN_LAYOUTS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__fwd/memory.h b/libcxx/include/__cxx03/__fwd/memory.h index b9e151855ad7d..109c0f6c051cc 100644 --- a/libcxx/include/__cxx03/__fwd/memory.h +++ b/libcxx/include/__cxx03/__fwd/memory.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___FWD_MEMORY_H #define _LIBCPP___FWD_MEMORY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/memory_resource.h b/libcxx/include/__cxx03/__fwd/memory_resource.h index d68b2c2b63154..1a12dcfdaf627 100644 --- a/libcxx/include/__cxx03/__fwd/memory_resource.h +++ b/libcxx/include/__cxx03/__fwd/memory_resource.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___FWD_MEMORY_RESOURCE_H #define _LIBCPP___FWD_MEMORY_RESOURCE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/ostream.h b/libcxx/include/__cxx03/__fwd/ostream.h index 3347e0f71d7a1..5660ac50aac46 100644 --- a/libcxx/include/__cxx03/__fwd/ostream.h +++ b/libcxx/include/__cxx03/__fwd/ostream.h @@ 
-9,8 +9,8 @@ #ifndef _LIBCPP___FWD_OSTREAM_H #define _LIBCPP___FWD_OSTREAM_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/pair.h b/libcxx/include/__cxx03/__fwd/pair.h index af32628fe1e0d..36656b454e0ab 100644 --- a/libcxx/include/__cxx03/__fwd/pair.h +++ b/libcxx/include/__cxx03/__fwd/pair.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___FWD_PAIR_H #define _LIBCPP___FWD_PAIR_H -#include <__config> -#include <__fwd/tuple.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/queue.h b/libcxx/include/__cxx03/__fwd/queue.h index 50d99ad9c29f4..54afd5113ba89 100644 --- a/libcxx/include/__cxx03/__fwd/queue.h +++ b/libcxx/include/__cxx03/__fwd/queue.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___FWD_QUEUE_H #define _LIBCPP___FWD_QUEUE_H -#include <__config> -#include <__functional/operations.h> -#include <__fwd/deque.h> -#include <__fwd/vector.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__fwd/deque.h> +#include <__cxx03/__fwd/vector.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/span.h b/libcxx/include/__cxx03/__fwd/span.h index 8dafa742c19df..47ed3eca5af4e 100644 --- a/libcxx/include/__cxx03/__fwd/span.h +++ b/libcxx/include/__cxx03/__fwd/span.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___FWD_SPAN_H #define _LIBCPP___FWD_SPAN_H -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__fwd/sstream.h b/libcxx/include/__cxx03/__fwd/sstream.h index 39a9c3faf1f80..cbb53942c8212 100644 --- a/libcxx/include/__cxx03/__fwd/sstream.h +++ b/libcxx/include/__cxx03/__fwd/sstream.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___FWD_SSTREAM_H #define _LIBCPP___FWD_SSTREAM_H -#include <__config> -#include <__fwd/memory.h> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/memory.h> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/stack.h b/libcxx/include/__cxx03/__fwd/stack.h index 7dab6c1a4f4e2..ddd7a67615876 100644 --- a/libcxx/include/__cxx03/__fwd/stack.h +++ b/libcxx/include/__cxx03/__fwd/stack.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_STACK_H #define _LIBCPP___FWD_STACK_H -#include <__config> -#include <__fwd/deque.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/deque.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/streambuf.h b/libcxx/include/__cxx03/__fwd/streambuf.h index b35afa6afe343..0b448836aaef7 100644 --- a/libcxx/include/__cxx03/__fwd/streambuf.h +++ b/libcxx/include/__cxx03/__fwd/streambuf.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_STREAMBUF_H #define _LIBCPP___FWD_STREAMBUF_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/string.h b/libcxx/include/__cxx03/__fwd/string.h index 2418e1f9b23d0..07042b205e8ce 100644 --- a/libcxx/include/__cxx03/__fwd/string.h +++ b/libcxx/include/__cxx03/__fwd/string.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___FWD_STRING_H #define _LIBCPP___FWD_STRING_H -#include <__config> -#include <__fwd/memory.h> -#include <__fwd/memory_resource.h> +#include <__cxx03/__config> +#include 
<__cxx03/__fwd/memory.h> +#include <__cxx03/__fwd/memory_resource.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/string_view.h b/libcxx/include/__cxx03/__fwd/string_view.h index 72a64be5b00b5..25ee230d20b33 100644 --- a/libcxx/include/__cxx03/__fwd/string_view.h +++ b/libcxx/include/__cxx03/__fwd/string_view.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___FWD_STRING_VIEW_H #define _LIBCPP___FWD_STRING_VIEW_H -#include <__config> -#include <__fwd/string.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/subrange.h b/libcxx/include/__cxx03/__fwd/subrange.h index 60a41da23dd44..aebbd866daeb2 100644 --- a/libcxx/include/__cxx03/__fwd/subrange.h +++ b/libcxx/include/__cxx03/__fwd/subrange.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___FWD_SUBRANGE_H #define _LIBCPP___FWD_SUBRANGE_H -#include <__concepts/copyable.h> -#include <__config> -#include <__iterator/concepts.h> -#include +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/tuple.h b/libcxx/include/__cxx03/__fwd/tuple.h index 902770c29555e..75f7e6d16aecf 100644 --- a/libcxx/include/__cxx03/__fwd/tuple.h +++ b/libcxx/include/__cxx03/__fwd/tuple.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___FWD_TUPLE_H #define _LIBCPP___FWD_TUPLE_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__fwd/vector.h b/libcxx/include/__cxx03/__fwd/vector.h index c9cc96137449f..3ed34e9f9ef94 100644 --- a/libcxx/include/__cxx03/__fwd/vector.h +++ b/libcxx/include/__cxx03/__fwd/vector.h @@ -9,8 +9,8 @@ #ifndef 
_LIBCPP___FWD_VECTOR_H #define _LIBCPP___FWD_VECTOR_H -#include <__config> -#include <__fwd/memory.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/memory.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__hash_table b/libcxx/include/__cxx03/__hash_table index 025758528573f..348dcaf01e8c1 100644 --- a/libcxx/include/__cxx03/__hash_table +++ b/libcxx/include/__cxx03/__hash_table @@ -10,47 +10,47 @@ #ifndef _LIBCPP___HASH_TABLE #define _LIBCPP___HASH_TABLE -#include <__algorithm/max.h> -#include <__algorithm/min.h> -#include <__assert> -#include <__bit/countl.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/invoke.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/can_extract_key.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include -#include -#include -#include // __launder +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include 
<__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/can_extract_key.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cmath> +#include <__cxx03/cstring> +#include <__cxx03/initializer_list> +#include <__cxx03/new> // __launder #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ios/fpos.h b/libcxx/include/__cxx03/__ios/fpos.h index 1af1e23ee50da..20904d3f5469d 100644 --- a/libcxx/include/__cxx03/__ios/fpos.h +++ b/libcxx/include/__cxx03/__ios/fpos.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___IOS_FPOS_H #define _LIBCPP___IOS_FPOS_H -#include <__config> -#include <__fwd/ios.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/ios.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/access.h b/libcxx/include/__cxx03/__iterator/access.h index acc4f60bf697e..2d7b4d16d47ce 100644 --- 
a/libcxx/include/__cxx03/__iterator/access.h +++ b/libcxx/include/__cxx03/__iterator/access.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___ITERATOR_ACCESS_H #define _LIBCPP___ITERATOR_ACCESS_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/advance.h b/libcxx/include/__cxx03/__iterator/advance.h index 296db1aaab652..b481652fbed9d 100644 --- a/libcxx/include/__cxx03/__iterator/advance.h +++ b/libcxx/include/__cxx03/__iterator/advance.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___ITERATOR_ADVANCE_H #define _LIBCPP___ITERATOR_ADVANCE_H -#include <__assert> -#include <__concepts/assignable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> -#include <__utility/convert_to_integral.h> -#include <__utility/declval.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__utility/convert_to_integral.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/aliasing_iterator.h 
b/libcxx/include/__cxx03/__iterator/aliasing_iterator.h index 94ba577078b5e..a3cb555606cb9 100644 --- a/libcxx/include/__cxx03/__iterator/aliasing_iterator.h +++ b/libcxx/include/__cxx03/__iterator/aliasing_iterator.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ITERATOR_ALIASING_ITERATOR_H #define _LIBCPP___ITERATOR_ALIASING_ITERATOR_H -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/is_trivial.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/is_trivial.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/back_insert_iterator.h b/libcxx/include/__cxx03/__iterator/back_insert_iterator.h index 6d3dd4b12966f..2d3885951d134 100644 --- a/libcxx/include/__cxx03/__iterator/back_insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/back_insert_iterator.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___ITERATOR_BACK_INSERT_ITERATOR_H #define _LIBCPP___ITERATOR_BACK_INSERT_ITERATOR_H -#include <__config> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/bounded_iter.h b/libcxx/include/__cxx03/__iterator/bounded_iter.h index 8a81c9ffbfc3f..dc93be089b1b8 100644 --- a/libcxx/include/__cxx03/__iterator/bounded_iter.h +++ 
b/libcxx/include/__cxx03/__iterator/bounded_iter.h @@ -10,23 +10,23 @@ #ifndef _LIBCPP___ITERATOR_BOUNDED_ITER_H #define _LIBCPP___ITERATOR_BOUNDED_ITER_H -#include <__assert> -#include <__compare/ordering.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_convertible.h> -#include <__utility/move.h> +#include <__cxx03/__assert> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/common_iterator.h b/libcxx/include/__cxx03/__iterator/common_iterator.h index 199de2cc7337b..5d052c0d2300e 100644 --- a/libcxx/include/__cxx03/__iterator/common_iterator.h +++ b/libcxx/include/__cxx03/__iterator/common_iterator.h @@ -10,32 +10,32 @@ #ifndef _LIBCPP___ITERATOR_COMMON_ITERATOR_H #define _LIBCPP___ITERATOR_COMMON_ITERATOR_H -#include <__assert> -#include <__concepts/assignable.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include 
<__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__memory/addressof.h> -#include <__type_traits/is_pointer.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/variant> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/concepts.h b/libcxx/include/__cxx03/__iterator/concepts.h index 0a4878308d55f..4973ce8987ef5 100644 --- a/libcxx/include/__cxx03/__iterator/concepts.h +++ b/libcxx/include/__cxx03/__iterator/concepts.h @@ -10,35 +10,35 @@ #ifndef _LIBCPP___ITERATOR_CONCEPTS_H #define _LIBCPP___ITERATOR_CONCEPTS_H -#include <__concepts/arithmetic.h> -#include <__concepts/assignable.h> -#include <__concepts/common_reference_with.h> -#include <__concepts/constructible.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/invocable.h> -#include <__concepts/movable.h> -#include <__concepts/predicate.h> -#include <__concepts/regular.h> -#include <__concepts/relation.h> -#include 
<__concepts/same_as.h> -#include <__concepts/semiregular.h> -#include <__concepts/totally_ordered.h> -#include <__config> -#include <__functional/invoke.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/common_reference.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/common_reference_with.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__concepts/predicate.h> +#include <__cxx03/__concepts/regular.h> +#include <__cxx03/__concepts/relation.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__concepts/totally_ordered.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/counted_iterator.h b/libcxx/include/__cxx03/__iterator/counted_iterator.h index ea2832e3b978d..161ec54a1a3b1 100644 --- a/libcxx/include/__cxx03/__iterator/counted_iterator.h +++ b/libcxx/include/__cxx03/__iterator/counted_iterator.h @@ -10,32 +10,32 @@ #ifndef _LIBCPP___ITERATOR_COUNTED_ITERATOR_H #define _LIBCPP___ITERATOR_COUNTED_ITERATOR_H -#include <__assert> -#include <__concepts/assignable.h> -#include <__concepts/common_with.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/conditional.h> -#include <__utility/move.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/common_with.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # 
pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/cpp17_iterator_concepts.h b/libcxx/include/__cxx03/__iterator/cpp17_iterator_concepts.h index ba3536b686099..938884e5f69ab 100644 --- a/libcxx/include/__cxx03/__iterator/cpp17_iterator_concepts.h +++ b/libcxx/include/__cxx03/__iterator/cpp17_iterator_concepts.h @@ -9,26 +9,26 @@ #ifndef _LIBCPP___ITERATOR_CPP17_ITERATOR_CONCEPTS_H #define _LIBCPP___ITERATOR_CPP17_ITERATOR_CONCEPTS_H -#include <__concepts/boolean_testable.h> -#include <__concepts/convertible_to.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/is_void.h> -#include <__utility/as_const.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> +#include <__cxx03/__concepts/boolean_testable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__utility/as_const.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__iterator/data.h b/libcxx/include/__cxx03/__iterator/data.h index b7c1603652b0e..d4f26608d9c49 100644 --- a/libcxx/include/__cxx03/__iterator/data.h +++ 
b/libcxx/include/__cxx03/__iterator/data.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___ITERATOR_DATA_H #define _LIBCPP___ITERATOR_DATA_H -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/default_sentinel.h b/libcxx/include/__cxx03/__iterator/default_sentinel.h index 3b65f442f1a85..fd05aeb59bce6 100644 --- a/libcxx/include/__cxx03/__iterator/default_sentinel.h +++ b/libcxx/include/__cxx03/__iterator/default_sentinel.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___ITERATOR_DEFAULT_SENTINEL_H #define _LIBCPP___ITERATOR_DEFAULT_SENTINEL_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/distance.h b/libcxx/include/__cxx03/__iterator/distance.h index 75bd49c9ae732..20f2771efc437 100644 --- a/libcxx/include/__cxx03/__iterator/distance.h +++ b/libcxx/include/__cxx03/__iterator/distance.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___ITERATOR_DISTANCE_H #define _LIBCPP___ITERATOR_DISTANCE_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/size.h> -#include <__type_traits/decay.h> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__iterator/empty.h b/libcxx/include/__cxx03/__iterator/empty.h index 773f2776955b2..0bdb951ee12dc 100644 --- a/libcxx/include/__cxx03/__iterator/empty.h +++ b/libcxx/include/__cxx03/__iterator/empty.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___ITERATOR_EMPTY_H #define _LIBCPP___ITERATOR_EMPTY_H -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/erase_if_container.h b/libcxx/include/__cxx03/__iterator/erase_if_container.h index 0f87f50cd1c16..11edc9c1312c6 100644 --- a/libcxx/include/__cxx03/__iterator/erase_if_container.h +++ b/libcxx/include/__cxx03/__iterator/erase_if_container.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___ITERATOR_ERASE_IF_CONTAINER_H #define _LIBCPP___ITERATOR_ERASE_IF_CONTAINER_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/front_insert_iterator.h b/libcxx/include/__cxx03/__iterator/front_insert_iterator.h index 7f2c54ec87442..f28b82840f2d6 100644 --- a/libcxx/include/__cxx03/__iterator/front_insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/front_insert_iterator.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___ITERATOR_FRONT_INSERT_ITERATOR_H #define _LIBCPP___ITERATOR_FRONT_INSERT_ITERATOR_H -#include <__config> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/incrementable_traits.h b/libcxx/include/__cxx03/__iterator/incrementable_traits.h index a228b228f6e55..8eb730bfe8ad9 100644 --- a/libcxx/include/__cxx03/__iterator/incrementable_traits.h +++ b/libcxx/include/__cxx03/__iterator/incrementable_traits.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___ITERATOR_INCREMENTABLE_TRAITS_H #define _LIBCPP___ITERATOR_INCREMENTABLE_TRAITS_H -#include <__concepts/arithmetic.h> -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_primary_template.h> -#include <__type_traits/make_signed.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_primary_template.h> +#include <__cxx03/__type_traits/make_signed.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/indirectly_comparable.h b/libcxx/include/__cxx03/__iterator/indirectly_comparable.h index e8a7398bacd2b..caef64e6f0831 100644 --- a/libcxx/include/__cxx03/__iterator/indirectly_comparable.h +++ b/libcxx/include/__cxx03/__iterator/indirectly_comparable.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___ITERATOR_INDIRECTLY_COMPARABLE_H #define _LIBCPP___ITERATOR_INDIRECTLY_COMPARABLE_H -#include <__config> -#include <__functional/identity.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> 
+#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/insert_iterator.h b/libcxx/include/__cxx03/__iterator/insert_iterator.h index 8b7574dc9ec0a..1f0320acda854 100644 --- a/libcxx/include/__cxx03/__iterator/insert_iterator.h +++ b/libcxx/include/__cxx03/__iterator/insert_iterator.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___ITERATOR_INSERT_ITERATOR_H #define _LIBCPP___ITERATOR_INSERT_ITERATOR_H -#include <__config> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/istream_iterator.h b/libcxx/include/__cxx03/__iterator/istream_iterator.h index 58c9ac6d4ccce..af0f477358a7f 100644 --- a/libcxx/include/__cxx03/__iterator/istream_iterator.h +++ b/libcxx/include/__cxx03/__iterator/istream_iterator.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___ITERATOR_ISTREAM_ITERATOR_H #define _LIBCPP___ITERATOR_ISTREAM_ITERATOR_H -#include <__config> -#include <__fwd/istream.h> -#include <__fwd/string.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/istream.h> +#include <__cxx03/__fwd/string.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include 
<__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h b/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h index 51c4ecff351f5..73da595172848 100644 --- a/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h +++ b/libcxx/include/__cxx03/__iterator/istreambuf_iterator.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___ITERATOR_ISTREAMBUF_ITERATOR_H #define _LIBCPP___ITERATOR_ISTREAMBUF_ITERATOR_H -#include <__config> -#include <__fwd/istream.h> -#include <__fwd/streambuf.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/istream.h> +#include <__cxx03/__fwd/streambuf.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/iter_move.h b/libcxx/include/__cxx03/__iterator/iter_move.h index ba8aed3c0ffbb..f45baaaf63c1e 100644 --- a/libcxx/include/__cxx03/__iterator/iter_move.h +++ b/libcxx/include/__cxx03/__iterator/iter_move.h @@ -10,21 +10,21 @@ #ifndef _LIBCPP___ITERATOR_ITER_MOVE_H #define _LIBCPP___ITERATOR_ITER_MOVE_H -#include <__concepts/class_or_enum.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/is_reference.h> +#include 
<__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/iter_swap.h b/libcxx/include/__cxx03/__iterator/iter_swap.h index 01ab1b97d6501..c3a7e164ff6f6 100644 --- a/libcxx/include/__cxx03/__iterator/iter_swap.h +++ b/libcxx/include/__cxx03/__iterator/iter_swap.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___ITERATOR_ITER_SWAP_H #define _LIBCPP___ITERATOR_ITER_SWAP_H -#include <__concepts/class_or_enum.h> -#include <__concepts/swappable.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_move.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__concepts/swappable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/iterator.h b/libcxx/include/__cxx03/__iterator/iterator.h index ba9308f3c2243..5d9648ce6a06f 100644 --- a/libcxx/include/__cxx03/__iterator/iterator.h +++ b/libcxx/include/__cxx03/__iterator/iterator.h @@ 
-10,8 +10,8 @@ #ifndef _LIBCPP___ITERATOR_ITERATOR_H #define _LIBCPP___ITERATOR_ITERATOR_H -#include <__config> -#include <cstddef> +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/iterator_traits.h b/libcxx/include/__cxx03/__iterator/iterator_traits.h index 11af9e301842c..7efd0c81c9301 100644 --- a/libcxx/include/__cxx03/__iterator/iterator_traits.h +++ b/libcxx/include/__cxx03/__iterator/iterator_traits.h @@ -10,31 +10,31 @@ #ifndef _LIBCPP___ITERATOR_ITERATOR_TRAITS_H #define _LIBCPP___ITERATOR_ITERATOR_TRAITS_H -#include <__concepts/arithmetic.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/same_as.h> -#include <__concepts/totally_ordered.h> -#include <__config> -#include <__fwd/pair.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/readable_traits.h> -#include <__type_traits/common_reference.h> -#include <__type_traits/conditional.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_primary_template.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_valid_expansion.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <cstddef> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/totally_ordered.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/pair.h> +#include 
<__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_primary_template.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_valid_expansion.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/iterator_with_data.h b/libcxx/include/__cxx03/__iterator/iterator_with_data.h index afdc0a4e12e21..fb7aafc38a30c 100644 --- a/libcxx/include/__cxx03/__iterator/iterator_with_data.h +++ b/libcxx/include/__cxx03/__iterator/iterator_with_data.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___ITERATOR_ITERATOR_WITH_DATA_H #define _LIBCPP___ITERATOR_ITERATOR_WITH_DATA_H -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__utility/move.h> +#include <__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include 
<__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__cxx03/__iterator/mergeable.h b/libcxx/include/__cxx03/__iterator/mergeable.h index 7976d751095e5..a20134defa87f 100644 --- a/libcxx/include/__cxx03/__iterator/mergeable.h +++ b/libcxx/include/__cxx03/__iterator/mergeable.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___ITERATOR_MERGEABLE_H #define _LIBCPP___ITERATOR_MERGEABLE_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/projected.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/move_iterator.h b/libcxx/include/__cxx03/__iterator/move_iterator.h index a1c53e9bd2b59..701f51a272da6 100644 --- a/libcxx/include/__cxx03/__iterator/move_iterator.h +++ b/libcxx/include/__cxx03/__iterator/move_iterator.h @@ -10,37 +10,37 @@ #ifndef _LIBCPP___ITERATOR_MOVE_ITERATOR_H #define _LIBCPP___ITERATOR_MOVE_ITERATOR_H -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__concepts/assignable.h> -#include <__concepts/convertible_to.h> -#include <__concepts/derived_from.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_sentinel.h> -#include <__iterator/readable_traits.h> -#include 
<__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include <__utility/move.h> +#include <__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_sentinel.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/move_sentinel.h b/libcxx/include/__cxx03/__iterator/move_sentinel.h index 4a2a09ef0611d..8518bcf39ea96 100644 --- a/libcxx/include/__cxx03/__iterator/move_sentinel.h +++ b/libcxx/include/__cxx03/__iterator/move_sentinel.h @@ -9,18 +9,18 @@ #ifndef 
_LIBCPP___ITERATOR_MOVE_SENTINEL_H #define _LIBCPP___ITERATOR_MOVE_SENTINEL_H -#include <__concepts/assignable.h> -#include <__concepts/convertible_to.h> -#include <__concepts/semiregular.h> -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__concepts/assignable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__iterator/next.h b/libcxx/include/__cxx03/__iterator/next.h index 21d3688ad9eb6..554760347606e 100644 --- a/libcxx/include/__cxx03/__iterator/next.h +++ b/libcxx/include/__cxx03/__iterator/next.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___ITERATOR_NEXT_H #define _LIBCPP___ITERATOR_NEXT_H -#include <__assert> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/enable_if.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/enable_if.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/ostream_iterator.h b/libcxx/include/__cxx03/__iterator/ostream_iterator.h index 05697e62d9dcb..40f737831f409 100644 --- a/libcxx/include/__cxx03/__iterator/ostream_iterator.h +++ b/libcxx/include/__cxx03/__iterator/ostream_iterator.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___ITERATOR_OSTREAM_ITERATOR_H #define _LIBCPP___ITERATOR_OSTREAM_ITERATOR_H -#include <__config> -#include 
<__fwd/ostream.h> -#include <__fwd/string.h> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__fwd/string.h> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/ostreambuf_iterator.h b/libcxx/include/__cxx03/__iterator/ostreambuf_iterator.h index dda0094dc3f53..363279ce684e1 100644 --- a/libcxx/include/__cxx03/__iterator/ostreambuf_iterator.h +++ b/libcxx/include/__cxx03/__iterator/ostreambuf_iterator.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___ITERATOR_OSTREAMBUF_ITERATOR_H #define _LIBCPP___ITERATOR_OSTREAMBUF_ITERATOR_H -#include <__config> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include -#include // for forward declaration of basic_streambuf +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/cstddef> +#include <__cxx03/iosfwd> // for forward declaration of basic_streambuf #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/permutable.h b/libcxx/include/__cxx03/__iterator/permutable.h index f65ba3bfbbaad..c2641de963de7 100644 --- a/libcxx/include/__cxx03/__iterator/permutable.h +++ b/libcxx/include/__cxx03/__iterator/permutable.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___ITERATOR_PERMUTABLE_H #define _LIBCPP___ITERATOR_PERMUTABLE_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_swap.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header diff --git a/libcxx/include/__cxx03/__iterator/prev.h b/libcxx/include/__cxx03/__iterator/prev.h index 2f0e6a088edb3..7256f2d0a0faf 100644 --- a/libcxx/include/__cxx03/__iterator/prev.h +++ b/libcxx/include/__cxx03/__iterator/prev.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___ITERATOR_PREV_H #define _LIBCPP___ITERATOR_PREV_H -#include <__assert> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__type_traits/enable_if.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__type_traits/enable_if.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/projected.h b/libcxx/include/__cxx03/__iterator/projected.h index 463d07b0d33c2..582d192120620 100644 --- a/libcxx/include/__cxx03/__iterator/projected.h +++ b/libcxx/include/__cxx03/__iterator/projected.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___ITERATOR_PROJECTED_H #define _LIBCPP___ITERATOR_PROJECTED_H -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> // iter_difference_t -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> // iter_difference_t +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/ranges_iterator_traits.h b/libcxx/include/__cxx03/__iterator/ranges_iterator_traits.h index 859e7082048ac..0dddc16575390 100644 --- a/libcxx/include/__cxx03/__iterator/ranges_iterator_traits.h +++ 
b/libcxx/include/__cxx03/__iterator/ranges_iterator_traits.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___ITERATOR_RANGES_ITERATOR_TRAITS_H #define _LIBCPP___ITERATOR_RANGES_ITERATOR_TRAITS_H -#include <__config> -#include <__fwd/pair.h> -#include <__ranges/concepts.h> -#include <__type_traits/remove_const.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/remove_const.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/readable_traits.h b/libcxx/include/__cxx03/__iterator/readable_traits.h index 25e74567fff11..07ef5e30ae469 100644 --- a/libcxx/include/__cxx03/__iterator/readable_traits.h +++ b/libcxx/include/__cxx03/__iterator/readable_traits.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___ITERATOR_READABLE_TRAITS_H #define _LIBCPP___ITERATOR_READABLE_TRAITS_H -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_primary_template.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_extent.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_primary_template.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_extent.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/reverse_access.h b/libcxx/include/__cxx03/__iterator/reverse_access.h index 54d7270b04a53..f307fd8b01b5e 100644 --- a/libcxx/include/__cxx03/__iterator/reverse_access.h +++ 
b/libcxx/include/__cxx03/__iterator/reverse_access.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___ITERATOR_REVERSE_ACCESS_H #define _LIBCPP___ITERATOR_REVERSE_ACCESS_H -#include <__config> -#include <__iterator/reverse_iterator.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/cstddef> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/reverse_iterator.h b/libcxx/include/__cxx03/__iterator/reverse_iterator.h index 50c0f21eaa286..000da25a0f330 100644 --- a/libcxx/include/__cxx03/__iterator/reverse_iterator.h +++ b/libcxx/include/__cxx03/__iterator/reverse_iterator.h @@ -10,35 +10,35 @@ #ifndef _LIBCPP___ITERATOR_REVERSE_ITERATOR_H #define _LIBCPP___ITERATOR_REVERSE_ITERATOR_H -#include <__algorithm/unwrap_iter.h> -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__concepts/convertible_to.h> -#include <__config> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/prev.h> -#include <__iterator/readable_traits.h> -#include <__iterator/segmented_iterator.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/subrange.h> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__utility/declval.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include 
<__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/segmented_iterator.h b/libcxx/include/__cxx03/__iterator/segmented_iterator.h index f3cd1e5fa1f5d..93cd8e195eb05 100644 --- a/libcxx/include/__cxx03/__iterator/segmented_iterator.h +++ b/libcxx/include/__cxx03/__iterator/segmented_iterator.h @@ -40,9 +40,9 @@ // - static It Traits::__compose(__segment_iterator, __local_iterator) // Returns the iterator composed of the segment iterator and local iterator. 
-#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/size.h b/libcxx/include/__cxx03/__iterator/size.h index 876e6963f77d9..3e8c2537f723f 100644 --- a/libcxx/include/__cxx03/__iterator/size.h +++ b/libcxx/include/__cxx03/__iterator/size.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___ITERATOR_SIZE_H #define _LIBCPP___ITERATOR_SIZE_H -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/make_signed.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/make_signed.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/sortable.h b/libcxx/include/__cxx03/__iterator/sortable.h index 1444860f2aa10..5c32629ff55dc 100644 --- a/libcxx/include/__cxx03/__iterator/sortable.h +++ b/libcxx/include/__cxx03/__iterator/sortable.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___ITERATOR_SORTABLE_H #define _LIBCPP___ITERATOR_SORTABLE_H -#include <__config> -#include <__functional/identity.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/permutable.h> -#include <__iterator/projected.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/permutable.h> +#include <__cxx03/__iterator/projected.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/unreachable_sentinel.h b/libcxx/include/__cxx03/__iterator/unreachable_sentinel.h index 77e663da4b3a6..29d76409728ff 100644 --- 
a/libcxx/include/__cxx03/__iterator/unreachable_sentinel.h +++ b/libcxx/include/__cxx03/__iterator/unreachable_sentinel.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___ITERATOR_UNREACHABLE_SENTINEL_H #define _LIBCPP___ITERATOR_UNREACHABLE_SENTINEL_H -#include <__config> -#include <__iterator/concepts.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__iterator/wrap_iter.h b/libcxx/include/__cxx03/__iterator/wrap_iter.h index 56183c0ee794d..5ef909c19f4d3 100644 --- a/libcxx/include/__cxx03/__iterator/wrap_iter.h +++ b/libcxx/include/__cxx03/__iterator/wrap_iter.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___ITERATOR_WRAP_ITER_H #define _LIBCPP___ITERATOR_WRAP_ITER_H -#include <__compare/ordering.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__locale b/libcxx/include/__cxx03/__locale index 4b382764b4464..7324aa2390ce7 100644 --- a/libcxx/include/__cxx03/__locale +++ b/libcxx/include/__cxx03/__locale @@ -10,27 +10,27 @@ #ifndef _LIBCPP___LOCALE #define _LIBCPP___LOCALE -#include <__config> -#include <__locale_dir/locale_base_api.h> -#include <__memory/shared_ptr.h> // __shared_count -#include <__mutex/once_flag.h> -#include 
<__type_traits/make_unsigned.h> -#include <__utility/no_destroy.h> -#include <__utility/private_constructor_tag.h> -#include -#include -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__locale_dir/locale_base_api.h> +#include <__cxx03/__memory/shared_ptr.h> // __shared_count +#include <__cxx03/__mutex/once_flag.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/no_destroy.h> +#include <__cxx03/__utility/private_constructor_tag.h> +#include <__cxx03/cctype> +#include <__cxx03/clocale> +#include <__cxx03/cstdint> +#include <__cxx03/cstdlib> +#include <__cxx03/string> // Some platforms require more includes than others. Keep the includes on all plaforms for now. -#include -#include +#include <__cxx03/cstddef> +#include <__cxx03/cstring> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include +# include <__cxx03/cwchar> #else -# include <__std_mbstate_t.h> +# include <__cxx03/__std_mbstate_t.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api.h index 8c000c558c527..e62ebb17765d7 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api.h @@ -10,23 +10,23 @@ #define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_H #if defined(_LIBCPP_MSVCRT_LIKE) -# include <__locale_dir/locale_base_api/win32.h> +# include <__cxx03/__locale_dir/locale_base_api/win32.h> #elif defined(_AIX) || defined(__MVS__) -# include <__locale_dir/locale_base_api/ibm.h> +# include <__cxx03/__locale_dir/locale_base_api/ibm.h> #elif defined(__ANDROID__) -# include <__locale_dir/locale_base_api/android.h> +# include <__cxx03/__locale_dir/locale_base_api/android.h> #elif defined(__sun__) -# include <__locale_dir/locale_base_api/solaris.h> +# include <__cxx03/__locale_dir/locale_base_api/solaris.h> #elif defined(_NEWLIB_VERSION) -# include 
<__locale_dir/locale_base_api/newlib.h> +# include <__cxx03/__locale_dir/locale_base_api/newlib.h> #elif defined(__OpenBSD__) -# include <__locale_dir/locale_base_api/openbsd.h> +# include <__cxx03/__locale_dir/locale_base_api/openbsd.h> #elif defined(__Fuchsia__) -# include <__locale_dir/locale_base_api/fuchsia.h> +# include <__cxx03/__locale_dir/locale_base_api/fuchsia.h> #elif defined(__wasi__) || defined(_LIBCPP_HAS_MUSL_LIBC) -# include <__locale_dir/locale_base_api/musl.h> +# include <__cxx03/__locale_dir/locale_base_api/musl.h> #elif defined(__APPLE__) || defined(__FreeBSD__) -# include +# include <__cxx03/xlocale.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h index 9965d8bbf6a2e..b943c82c62298 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/android.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_ANDROID_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_ANDROID_H -#include +#include <__cxx03/stdlib.h> // FIXME: Is this actually required? 
extern "C" { #include } -#include +#include <__cxx03/android/api-level.h> #if __ANDROID_API__ < 21 -# include <__support/xlocale/__posix_l_fallback.h> +# include <__cxx03/__support/xlocale/__posix_l_fallback.h> #endif // If we do not have this header, we are in a platform build rather than an NDK @@ -31,7 +31,7 @@ extern "C" { // legacy_stdlib_inlines.h # if __NDK_MAJOR__ <= 16 # if __ANDROID_API__ < 21 -# include <__support/xlocale/__strtonum_fallback.h> +# include <__cxx03/__support/xlocale/__strtonum_fallback.h> # elif __ANDROID_API__ < 26 inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/bsd_locale_fallbacks.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/bsd_locale_fallbacks.h index 76b94287cd6cc..129cacb317ee4 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/bsd_locale_fallbacks.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/bsd_locale_fallbacks.h @@ -13,13 +13,13 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H -#include <__locale_dir/locale_base_api/locale_guard.h> -#include -#include -#include +#include <__cxx03/__locale_dir/locale_base_api/locale_guard.h> +#include <__cxx03/cstdio> +#include <__cxx03/stdarg.h> +#include <__cxx03/stdlib.h> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include +# include <__cxx03/cwchar> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/fuchsia.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/fuchsia.h index 4c3440f981c6d..74d017d07435d 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/fuchsia.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/fuchsia.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H -#include 
<__support/xlocale/__posix_l_fallback.h> -#include <__support/xlocale/__strtonum_fallback.h> -#include -#include +#include <__cxx03/__support/xlocale/__posix_l_fallback.h> +#include <__cxx03/__support/xlocale/__strtonum_fallback.h> +#include <__cxx03/cstdlib> +#include <__cxx03/cwchar> #endif // _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/ibm.h index 01af20194428b..ff4b2f8b272cd 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/ibm.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/ibm.h @@ -11,19 +11,19 @@ #define _LIBCPP___LOCALE_LOCALE_BASE_API_IBM_H #if defined(__MVS__) -# include <__support/ibm/locale_mgmt_zos.h> +# include <__cxx03/__support/ibm/locale_mgmt_zos.h> #endif // defined(__MVS__) -#include -#include -#include +#include <__cxx03/locale.h> +#include <__cxx03/stdarg.h> +#include <__cxx03/stdio.h> #include "cstdlib" #if defined(__MVS__) -# include +# include <__cxx03/wctype.h> // POSIX routines -# include <__support/xlocale/__posix_l_fallback.h> +# include <__cxx03/__support/xlocale/__posix_l_fallback.h> #endif // defined(__MVS__) namespace { diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/locale_guard.h index 2baacb51cd065..17eade28f35f3 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/locale_guard.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/locale_guard.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H -#include <__config> -#include <__locale> // for locale_t -#include +#include <__cxx03/__config> +#include <__cxx03/__locale> // for locale_t +#include <__cxx03/clocale> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__locale_dir/locale_base_api/musl.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/musl.h index bf7b849d58634..f67511f4a0bc5 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/musl.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/musl.h @@ -17,8 +17,8 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_MUSL_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_MUSL_H -#include -#include +#include <__cxx03/cstdlib> +#include <__cxx03/cwchar> inline _LIBCPP_HIDE_FROM_ABI long long strtoll_l(const char* __nptr, char** __endptr, int __base, locale_t) { return ::strtoll(__nptr, __endptr, __base); diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/openbsd.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/openbsd.h index 0c05d6a0f7887..effd9e0d76b1c 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/openbsd.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/openbsd.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H -#include <__support/xlocale/__strtonum_fallback.h> -#include -#include -#include -#include +#include <__cxx03/__support/xlocale/__strtonum_fallback.h> +#include <__cxx03/clocale> +#include <__cxx03/cstdlib> +#include <__cxx03/ctype.h> +#include <__cxx03/cwctype> #endif // _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H diff --git a/libcxx/include/__cxx03/__locale_dir/locale_base_api/win32.h b/libcxx/include/__cxx03/__locale_dir/locale_base_api/win32.h index f66baffb69204..60b9435039f6d 100644 --- a/libcxx/include/__cxx03/__locale_dir/locale_base_api/win32.h +++ b/libcxx/include/__cxx03/__locale_dir/locale_base_api/win32.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_WIN32_H #define _LIBCPP___LOCALE_LOCALE_BASE_API_WIN32_H -#include <__config> -#include -#include // _locale_t -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/locale.h> // _locale_t 
+#include <__cxx03/stdio.h> +#include <__cxx03/string> #define _X_ALL LC_ALL #define _X_COLLATE LC_COLLATE diff --git a/libcxx/include/__cxx03/__math/abs.h b/libcxx/include/__cxx03/__math/abs.h index ab82a2800f53c..ad543e654c48b 100644 --- a/libcxx/include/__cxx03/__math/abs.h +++ b/libcxx/include/__cxx03/__math/abs.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_ABS_H #define _LIBCPP___MATH_ABS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/copysign.h b/libcxx/include/__cxx03/__math/copysign.h index b38690bb581a1..e70d6ee286774 100644 --- a/libcxx/include/__cxx03/__math/copysign.h +++ b/libcxx/include/__cxx03/__math/copysign.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MATH_COPYSIGN_H #define _LIBCPP___MATH_COPYSIGN_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/promote.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/promote.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/error_functions.h b/libcxx/include/__cxx03/__math/error_functions.h index 6b528bb290001..47f506096676e 100644 --- a/libcxx/include/__cxx03/__math/error_functions.h +++ b/libcxx/include/__cxx03/__math/error_functions.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_ERROR_FUNCTIONS_H #define _LIBCPP___MATH_ERROR_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include 
<__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/exponential_functions.h b/libcxx/include/__cxx03/__math/exponential_functions.h index 109c3349970f6..2e988e1709541 100644 --- a/libcxx/include/__cxx03/__math/exponential_functions.h +++ b/libcxx/include/__cxx03/__math/exponential_functions.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___MATH_EXPONENTIAL_FUNCTIONS_H #define _LIBCPP___MATH_EXPONENTIAL_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/fdim.h b/libcxx/include/__cxx03/__math/fdim.h index dc1b4ecc07dce..5f0c8ae34eba3 100644 --- a/libcxx/include/__cxx03/__math/fdim.h +++ b/libcxx/include/__cxx03/__math/fdim.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MATH_FDIM_H #define _LIBCPP___MATH_FDIM_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/fma.h b/libcxx/include/__cxx03/__math/fma.h index 6ba7a5a2d26d6..b58bc1a9855f7 100644 --- a/libcxx/include/__cxx03/__math/fma.h +++ 
b/libcxx/include/__cxx03/__math/fma.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MATH_FMA_H #define _LIBCPP___MATH_FMA_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/gamma.h b/libcxx/include/__cxx03/__math/gamma.h index 693e111a84e99..613ff0104def4 100644 --- a/libcxx/include/__cxx03/__math/gamma.h +++ b/libcxx/include/__cxx03/__math/gamma.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_GAMMA_H #define _LIBCPP___MATH_GAMMA_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/hyperbolic_functions.h b/libcxx/include/__cxx03/__math/hyperbolic_functions.h index 78832bae70c9d..63070ee4f000d 100644 --- a/libcxx/include/__cxx03/__math/hyperbolic_functions.h +++ b/libcxx/include/__cxx03/__math/hyperbolic_functions.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_HYPERBOLIC_FUNCTIONS_H #define _LIBCPP___MATH_HYPERBOLIC_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/hypot.h b/libcxx/include/__cxx03/__math/hypot.h index b992163711010..cb032f0fd6991 100644 --- 
a/libcxx/include/__cxx03/__math/hypot.h +++ b/libcxx/include/__cxx03/__math/hypot.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___MATH_HYPOT_H #define _LIBCPP___MATH_HYPOT_H -#include <__algorithm/max.h> -#include <__config> -#include <__math/abs.h> -#include <__math/exponential_functions.h> -#include <__math/roots.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__config> +#include <__cxx03/__math/abs.h> +#include <__cxx03/__math/exponential_functions.h> +#include <__cxx03/__math/roots.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__math/inverse_hyperbolic_functions.h b/libcxx/include/__cxx03/__math/inverse_hyperbolic_functions.h index 4660a58e4eba0..0701ead8a32f1 100644 --- a/libcxx/include/__cxx03/__math/inverse_hyperbolic_functions.h +++ b/libcxx/include/__cxx03/__math/inverse_hyperbolic_functions.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_INVERSE_HYPERBOLIC_FUNCTIONS_H #define _LIBCPP___MATH_INVERSE_HYPERBOLIC_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/inverse_trigonometric_functions.h 
b/libcxx/include/__cxx03/__math/inverse_trigonometric_functions.h index cd98b46a6aab8..626295321627a 100644 --- a/libcxx/include/__cxx03/__math/inverse_trigonometric_functions.h +++ b/libcxx/include/__cxx03/__math/inverse_trigonometric_functions.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___MATH_INVERSE_TRIGONOMETRIC_FUNCTIONS_H #define _LIBCPP___MATH_INVERSE_TRIGONOMETRIC_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/logarithms.h b/libcxx/include/__cxx03/__math/logarithms.h index 5f5f943977a50..ad067abdb3813 100644 --- a/libcxx/include/__cxx03/__math/logarithms.h +++ b/libcxx/include/__cxx03/__math/logarithms.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_LOGARITHMS_H #define _LIBCPP___MATH_LOGARITHMS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/min_max.h b/libcxx/include/__cxx03/__math/min_max.h index 27997b44910a1..717c8cf57e866 100644 --- a/libcxx/include/__cxx03/__math/min_max.h +++ b/libcxx/include/__cxx03/__math/min_max.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MATH_MIN_MAX_H #define _LIBCPP___MATH_MIN_MAX_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include 
<__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/modulo.h b/libcxx/include/__cxx03/__math/modulo.h index c8ea506f37d75..4fe58b1bf45dd 100644 --- a/libcxx/include/__cxx03/__math/modulo.h +++ b/libcxx/include/__cxx03/__math/modulo.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MATH_MODULO_H #define _LIBCPP___MATH_MODULO_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/remainder.h b/libcxx/include/__cxx03/__math/remainder.h index 0fbf0b8ef97b9..e7d825f4c8e38 100644 --- a/libcxx/include/__cxx03/__math/remainder.h +++ b/libcxx/include/__cxx03/__math/remainder.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___MATH_REMAINDER_H #define _LIBCPP___MATH_REMAINDER_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/roots.h 
b/libcxx/include/__cxx03/__math/roots.h index 359fd747cfbef..bb78b70dcaacb 100644 --- a/libcxx/include/__cxx03/__math/roots.h +++ b/libcxx/include/__cxx03/__math/roots.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_ROOTS_H #define _LIBCPP___MATH_ROOTS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/rounding_functions.h b/libcxx/include/__cxx03/__math/rounding_functions.h index f7246ba7fed0d..69b021729e0ac 100644 --- a/libcxx/include/__cxx03/__math/rounding_functions.h +++ b/libcxx/include/__cxx03/__math/rounding_functions.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___MATH_ROUNDING_FUNCTIONS_H #define _LIBCPP___MATH_ROUNDING_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/special_functions.h b/libcxx/include/__cxx03/__math/special_functions.h index 0b1c753a659ad..27fb394e803e3 100644 --- a/libcxx/include/__cxx03/__math/special_functions.h +++ b/libcxx/include/__cxx03/__math/special_functions.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___MATH_SPECIAL_FUNCTIONS_H #define _LIBCPP___MATH_SPECIAL_FUNCTIONS_H -#include <__config> -#include <__math/copysign.h> -#include <__math/traits.h> -#include <__type_traits/enable_if.h> -#include 
<__type_traits/is_integral.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__math/copysign.h> +#include <__cxx03/__math/traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/traits.h b/libcxx/include/__cxx03/__math/traits.h index 27ec52ecef022..60823378a835b 100644 --- a/libcxx/include/__cxx03/__math/traits.h +++ b/libcxx/include/__cxx03/__math/traits.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___MATH_TRAITS_H #define _LIBCPP___MATH_TRAITS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/promote.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/promote.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__math/trigonometric_functions.h b/libcxx/include/__cxx03/__math/trigonometric_functions.h index 0ad91c7631609..9d99f409cb207 100644 --- a/libcxx/include/__cxx03/__math/trigonometric_functions.h +++ b/libcxx/include/__cxx03/__math/trigonometric_functions.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MATH_TRIGONOMETRIC_FUNCTIONS_H #define _LIBCPP___MATH_TRIGONOMETRIC_FUNCTIONS_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_integral.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__mbstate_t.h b/libcxx/include/__cxx03/__mbstate_t.h index bfa6d617e2b8f..302cc5ddc4db2 100644 --- a/libcxx/include/__cxx03/__mbstate_t.h +++ b/libcxx/include/__cxx03/__mbstate_t.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___MBSTATE_T_H #define _LIBCPP___MBSTATE_T_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -22,7 +22,7 @@ // or , hence the #include_next of those headers instead of #include. // (e.g. if isn't present in the C library, the libc++ // will include this header. This header needs to not turn around and cyclically -// include , but fall through to .) +// include <__cxx03/wchar.h>, but fall through to .) // // This does not define std::mbstate_t -- this only brings in the declaration // in the global namespace. diff --git a/libcxx/include/__cxx03/__mdspan/default_accessor.h b/libcxx/include/__cxx03/__mdspan/default_accessor.h index 1cc5f15545fc8..1745f077db701 100644 --- a/libcxx/include/__cxx03/__mdspan/default_accessor.h +++ b/libcxx/include/__cxx03/__mdspan/default_accessor.h @@ -17,20 +17,20 @@ #ifndef _LIBCPP___MDSPAN_DEFAULT_ACCESSOR_H #define _LIBCPP___MDSPAN_DEFAULT_ACCESSOR_H -#include <__config> -#include <__type_traits/is_abstract.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/remove_const.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_abstract.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/cinttypes> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__mdspan/extents.h b/libcxx/include/__cxx03/__mdspan/extents.h index 95082ef3d11ac..b6f2b6abf3f61 100644 --- a/libcxx/include/__cxx03/__mdspan/extents.h +++ b/libcxx/include/__cxx03/__mdspan/extents.h @@ -17,28 +17,28 @@ #ifndef _LIBCPP___MDSPAN_EXTENTS_H #define _LIBCPP___MDSPAN_EXTENTS_H -#include <__assert> -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/integer_sequence.h> -#include <__utility/unreachable.h> -#include -#include -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/array> +#include <__cxx03/cinttypes> +#include <__cxx03/concepts> +#include <__cxx03/cstddef> +#include <__cxx03/limits> +#include <__cxx03/span> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__mdspan/layout_left.h b/libcxx/include/__cxx03/__mdspan/layout_left.h index d058cbccffd96..017f278ab6db0 100644 --- a/libcxx/include/__cxx03/__mdspan/layout_left.h +++ b/libcxx/include/__cxx03/__mdspan/layout_left.h @@ -17,25 +17,25 @@ #ifndef _LIBCPP___MDSPAN_LAYOUT_LEFT_H #define _LIBCPP___MDSPAN_LAYOUT_LEFT_H -#include <__assert> -#include <__config> -#include <__fwd/mdspan.h> -#include <__mdspan/extents.h> -#include <__type_traits/is_constructible.h> -#include 
<__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/integer_sequence.h> -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/mdspan.h> +#include <__cxx03/__mdspan/extents.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/array> +#include <__cxx03/cinttypes> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__mdspan/layout_right.h b/libcxx/include/__cxx03/__mdspan/layout_right.h index 6842e9dc37fdc..13d430cf6b5e4 100644 --- a/libcxx/include/__cxx03/__mdspan/layout_right.h +++ b/libcxx/include/__cxx03/__mdspan/layout_right.h @@ -17,24 +17,24 @@ #ifndef _LIBCPP___MDSPAN_LAYOUT_RIGHT_H #define _LIBCPP___MDSPAN_LAYOUT_RIGHT_H -#include <__assert> -#include <__config> -#include <__fwd/mdspan.h> -#include <__mdspan/extents.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/integer_sequence.h> -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/mdspan.h> +#include <__cxx03/__mdspan/extents.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/cinttypes> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header 
#endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__mdspan/layout_stride.h b/libcxx/include/__cxx03/__mdspan/layout_stride.h index 86148ac849eca..9b387ba6b9f60 100644 --- a/libcxx/include/__cxx03/__mdspan/layout_stride.h +++ b/libcxx/include/__cxx03/__mdspan/layout_stride.h @@ -17,27 +17,27 @@ #ifndef _LIBCPP___MDSPAN_LAYOUT_STRIDE_H #define _LIBCPP___MDSPAN_LAYOUT_STRIDE_H -#include <__assert> -#include <__config> -#include <__fwd/mdspan.h> -#include <__mdspan/extents.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/as_const.h> -#include <__utility/integer_sequence.h> -#include <__utility/swap.h> -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/mdspan.h> +#include <__cxx03/__mdspan/extents.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/as_const.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/array> +#include <__cxx03/cinttypes> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__mdspan/mdspan.h b/libcxx/include/__cxx03/__mdspan/mdspan.h index 1ff4fd4ba4a82..253ac1cbb1c42 100644 --- a/libcxx/include/__cxx03/__mdspan/mdspan.h +++ b/libcxx/include/__cxx03/__mdspan/mdspan.h @@ -17,37 +17,37 @@ #ifndef _LIBCPP___MDSPAN_MDSPAN_H #define _LIBCPP___MDSPAN_MDSPAN_H -#include <__assert> -#include <__config> -#include <__fwd/mdspan.h> -#include 
<__mdspan/default_accessor.h> -#include <__mdspan/extents.h> -#include <__type_traits/extent.h> -#include <__type_traits/is_abstract.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__type_traits/rank.h> -#include <__type_traits/remove_all_extents.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/remove_reference.h> -#include <__utility/integer_sequence.h> -#include -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/mdspan.h> +#include <__cxx03/__mdspan/default_accessor.h> +#include <__cxx03/__mdspan/extents.h> +#include <__cxx03/__type_traits/extent.h> +#include <__cxx03/__type_traits/is_abstract.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/rank.h> +#include <__cxx03/__type_traits/remove_all_extents.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/array> +#include <__cxx03/cinttypes> +#include <__cxx03/cstddef> +#include <__cxx03/limits> +#include <__cxx03/span> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/addressof.h b/libcxx/include/__cxx03/__memory/addressof.h index 
fa590212c49b9..0f6b875884448 100644 --- a/libcxx/include/__cxx03/__memory/addressof.h +++ b/libcxx/include/__cxx03/__memory/addressof.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___MEMORY_ADDRESSOF_H #define _LIBCPP___MEMORY_ADDRESSOF_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/align.h b/libcxx/include/__cxx03/__memory/align.h index bbb995f4a8c8e..3ef7011bdb62f 100644 --- a/libcxx/include/__cxx03/__memory/align.h +++ b/libcxx/include/__cxx03/__memory/align.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___MEMORY_ALIGN_H #define _LIBCPP___MEMORY_ALIGN_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/aligned_alloc.h b/libcxx/include/__cxx03/__memory/aligned_alloc.h index cb424328bcafc..9e864f5a7ab0d 100644 --- a/libcxx/include/__cxx03/__memory/aligned_alloc.h +++ b/libcxx/include/__cxx03/__memory/aligned_alloc.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MEMORY_ALIGNED_ALLOC_H #define _LIBCPP___MEMORY_ALIGNED_ALLOC_H -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> +#include <__cxx03/cstdlib> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/allocate_at_least.h b/libcxx/include/__cxx03/__memory/allocate_at_least.h index df73d9a2e94aa..e8b4cd3a17e5f 100644 --- a/libcxx/include/__cxx03/__memory/allocate_at_least.h +++ b/libcxx/include/__cxx03/__memory/allocate_at_least.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MEMORY_ALLOCATE_AT_LEAST_H #define _LIBCPP___MEMORY_ALLOCATE_AT_LEAST_H -#include <__config> -#include <__memory/allocator_traits.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/cstddef> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/allocation_guard.h b/libcxx/include/__cxx03/__memory/allocation_guard.h index cb870af7be676..a84ab2de7eb9e 100644 --- a/libcxx/include/__cxx03/__memory/allocation_guard.h +++ b/libcxx/include/__cxx03/__memory/allocation_guard.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___MEMORY_ALLOCATION_GUARD_H #define _LIBCPP___MEMORY_ALLOCATION_GUARD_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/allocator_traits.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/allocator.h b/libcxx/include/__cxx03/__memory/allocator.h index 2d8624e771bce..e0066c6e89b9b 100644 --- a/libcxx/include/__cxx03/__memory/allocator.h +++ b/libcxx/include/__cxx03/__memory/allocator.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___MEMORY_ALLOCATOR_H #define _LIBCPP___MEMORY_ALLOCATOR_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocator_traits.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_void.h> -#include <__type_traits/is_volatile.h> -#include <__utility/forward.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocate_at_least.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include 
<__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/allocator_arg_t.h b/libcxx/include/__cxx03/__memory/allocator_arg_t.h index 7e66da740cd4f..5f40454dca757 100644 --- a/libcxx/include/__cxx03/__memory/allocator_arg_t.h +++ b/libcxx/include/__cxx03/__memory/allocator_arg_t.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___FUNCTIONAL_ALLOCATOR_ARG_T_H #define _LIBCPP___FUNCTIONAL_ALLOCATOR_ARG_T_H -#include <__config> -#include <__memory/uses_allocator.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/uses_allocator.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/allocator_destructor.h b/libcxx/include/__cxx03/__memory/allocator_destructor.h index ed3d8918f5fe3..e009cd1d04aa1 100644 --- a/libcxx/include/__cxx03/__memory/allocator_destructor.h +++ b/libcxx/include/__cxx03/__memory/allocator_destructor.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___MEMORY_ALLOCATOR_DESTRUCTOR_H #define _LIBCPP___MEMORY_ALLOCATOR_DESTRUCTOR_H -#include <__config> -#include <__memory/allocator_traits.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/allocator_traits.h b/libcxx/include/__cxx03/__memory/allocator_traits.h index 
c5fcc89327b8f..9833df6c50bd6 100644 --- a/libcxx/include/__cxx03/__memory/allocator_traits.h +++ b/libcxx/include/__cxx03/__memory/allocator_traits.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H #define _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H -#include <__config> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_empty.h> -#include <__type_traits/is_same.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_reference.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_empty.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/assume_aligned.h b/libcxx/include/__cxx03/__memory/assume_aligned.h index 526eb3334f958..3b345d4c5c5c4 100644 --- a/libcxx/include/__cxx03/__memory/assume_aligned.h +++ b/libcxx/include/__cxx03/__memory/assume_aligned.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___MEMORY_ASSUME_ALIGNED_H #define _LIBCPP___MEMORY_ASSUME_ALIGNED_H -#include <__assert> -#include <__config> -#include <__type_traits/is_constant_evaluated.h> -#include -#include +#include 
<__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/auto_ptr.h b/libcxx/include/__cxx03/__memory/auto_ptr.h index 752143616bb20..f5695745d22ea 100644 --- a/libcxx/include/__cxx03/__memory/auto_ptr.h +++ b/libcxx/include/__cxx03/__memory/auto_ptr.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___MEMORY_AUTO_PTR_H #define _LIBCPP___MEMORY_AUTO_PTR_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/builtin_new_allocator.h b/libcxx/include/__cxx03/__memory/builtin_new_allocator.h index c6f7f3c5ff52a..e0284dfcff1ee 100644 --- a/libcxx/include/__cxx03/__memory/builtin_new_allocator.h +++ b/libcxx/include/__cxx03/__memory/builtin_new_allocator.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H #define _LIBCPP___MEMORY_BUILTIN_NEW_ALLOCATOR_H -#include <__config> -#include <__memory/unique_ptr.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/compressed_pair.h b/libcxx/include/__cxx03/__memory/compressed_pair.h index 40e5cfc35fb04..2af34f02772e0 100644 --- a/libcxx/include/__cxx03/__memory/compressed_pair.h +++ b/libcxx/include/__cxx03/__memory/compressed_pair.h @@ -10,28 +10,28 @@ #ifndef _LIBCPP___MEMORY_COMPRESSED_PAIR_H #define _LIBCPP___MEMORY_COMPRESSED_PAIR_H -#include <__config> -#include <__fwd/tuple.h> -#include <__tuple/tuple_indices.h> -#include <__type_traits/decay.h> -#include <__type_traits/dependent_type.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_constructible.h> 
-#include <__type_traits/is_empty.h> -#include <__type_traits/is_final.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/piecewise_construct.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/dependent_type.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_empty.h> +#include <__cxx03/__type_traits/is_final.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/concepts.h b/libcxx/include/__cxx03/__memory/concepts.h index 216144aad7480..85620202495eb 100644 --- a/libcxx/include/__cxx03/__memory/concepts.h +++ b/libcxx/include/__cxx03/__memory/concepts.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___MEMORY_CONCEPTS_H #define _LIBCPP___MEMORY_CONCEPTS_H -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> // TODO(modules): This should not be required +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> 
+#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> // TODO(modules): This should not be required #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/construct_at.h b/libcxx/include/__cxx03/__memory/construct_at.h index eb02132480064..090e132a67ee5 100644 --- a/libcxx/include/__cxx03/__memory/construct_at.h +++ b/libcxx/include/__cxx03/__memory/construct_at.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___MEMORY_CONSTRUCT_AT_H #define _LIBCPP___MEMORY_CONSTRUCT_AT_H -#include <__assert> -#include <__config> -#include <__iterator/access.h> -#include <__memory/addressof.h> -#include <__memory/voidify.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_array.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/voidify.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/destruct_n.h b/libcxx/include/__cxx03/__memory/destruct_n.h index 78635ad0af04b..f5a24a67ca52c 100644 --- a/libcxx/include/__cxx03/__memory/destruct_n.h +++ b/libcxx/include/__cxx03/__memory/destruct_n.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___MEMORY_DESTRUCT_N_H #define 
_LIBCPP___MEMORY_DESTRUCT_N_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_trivially_destructible.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/inout_ptr.h b/libcxx/include/__cxx03/__memory/inout_ptr.h index e5f3ac5d027e8..bb715e3b86c6c 100644 --- a/libcxx/include/__cxx03/__memory/inout_ptr.h +++ b/libcxx/include/__cxx03/__memory/inout_ptr.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___INOUT_PTR_H #define _LIBCPP___INOUT_PTR_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/pointer_traits.h> -#include <__memory/shared_ptr.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_specialization.h> -#include <__type_traits/is_void.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_specialization.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/out_ptr.h b/libcxx/include/__cxx03/__memory/out_ptr.h index fd99110790cc8..9aa9f33e293c0 100644 --- a/libcxx/include/__cxx03/__memory/out_ptr.h +++ b/libcxx/include/__cxx03/__memory/out_ptr.h @@ -10,23 +10,23 @@ #ifndef 
_LIBCPP___OUT_PTR_H #define _LIBCPP___OUT_PTR_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/pointer_traits.h> -#include <__memory/shared_ptr.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/is_specialization.h> -#include <__type_traits/is_void.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/is_specialization.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/pointer_traits.h b/libcxx/include/__cxx03/__memory/pointer_traits.h index 0914aceb318b7..9c480af773b70 100644 --- a/libcxx/include/__cxx03/__memory/pointer_traits.h +++ b/libcxx/include/__cxx03/__memory/pointer_traits.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___MEMORY_POINTER_TRAITS_H #define _LIBCPP___MEMORY_POINTER_TRAITS_H -#include <__config> -#include <__memory/addressof.h> -#include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_class.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_void.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_class.h> +#include 
<__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/ranges_construct_at.h b/libcxx/include/__cxx03/__memory/ranges_construct_at.h index f731e75e7bdc0..f8acc9f0c21d9 100644 --- a/libcxx/include/__cxx03/__memory/ranges_construct_at.h +++ b/libcxx/include/__cxx03/__memory/ranges_construct_at.h @@ -10,26 +10,26 @@ #ifndef _LIBCPP___MEMORY_RANGES_CONSTRUCT_AT_H #define _LIBCPP___MEMORY_RANGES_CONSTRUCT_AT_H -#include <__concepts/destructible.h> -#include <__config> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__memory/concepts.h> -#include <__memory/construct_at.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__concepts/destructible.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/concepts.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__memory/ranges_uninitialized_algorithms.h b/libcxx/include/__cxx03/__memory/ranges_uninitialized_algorithms.h index 90090055bbbbf..f7af434847814 100644 --- a/libcxx/include/__cxx03/__memory/ranges_uninitialized_algorithms.h +++ b/libcxx/include/__cxx03/__memory/ranges_uninitialized_algorithms.h @@ -10,29 +10,29 @@ #ifndef _LIBCPP___MEMORY_RANGES_UNINITIALIZED_ALGORITHMS_H #define _LIBCPP___MEMORY_RANGES_UNINITIALIZED_ALGORITHMS_H -#include <__algorithm/in_out_result.h> -#include <__concepts/constructible.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__memory/concepts.h> -#include <__memory/uninitialized_algorithms.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include <__type_traits/remove_reference.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/in_out_result.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__memory/concepts.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/raw_storage_iterator.h 
b/libcxx/include/__cxx03/__memory/raw_storage_iterator.h index 774878aa1c5e8..bde1e1e6f0304 100644 --- a/libcxx/include/__cxx03/__memory/raw_storage_iterator.h +++ b/libcxx/include/__cxx03/__memory/raw_storage_iterator.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___MEMORY_RAW_STORAGE_ITERATOR_H #define _LIBCPP___MEMORY_RAW_STORAGE_ITERATOR_H -#include <__config> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/shared_ptr.h b/libcxx/include/__cxx03/__memory/shared_ptr.h index d487e4fbe3a95..f6322d9141e30 100644 --- a/libcxx/include/__cxx03/__memory/shared_ptr.h +++ b/libcxx/include/__cxx03/__memory/shared_ptr.h @@ -10,50 +10,50 @@ #ifndef _LIBCPP___MEMORY_SHARED_PTR_H #define _LIBCPP___MEMORY_SHARED_PTR_H -#include <__compare/compare_three_way.h> -#include <__compare/ordering.h> -#include <__config> -#include <__exception/exception.h> -#include <__functional/binary_function.h> -#include <__functional/operations.h> -#include <__functional/reference_wrapper.h> -#include <__fwd/ostream.h> -#include <__iterator/access.h> -#include <__memory/addressof.h> -#include <__memory/allocation_guard.h> -#include <__memory/allocator.h> -#include <__memory/allocator_destructor.h> -#include <__memory/allocator_traits.h> -#include <__memory/auto_ptr.h> -#include <__memory/compressed_pair.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include 
<__memory/uninitialized_algorithms.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_bounded_array.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_unbounded_array.h> -#include <__type_traits/nat.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_extent.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include -#include -#include +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocation_guard.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_destructor.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/auto_ptr.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/is_array.h> +#include 
<__cxx03/__type_traits/is_bounded_array.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_unbounded_array.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_extent.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/cstddef> +#include <__cxx03/new> +#include <__cxx03/typeinfo> #if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include <__atomic/memory_order.h> +# include <__cxx03/__atomic/memory_order.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -61,7 +61,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/swap_allocator.h b/libcxx/include/__cxx03/__memory/swap_allocator.h index b17e082a43c9f..3b463553d3ff5 100644 --- a/libcxx/include/__cxx03/__memory/swap_allocator.h +++ b/libcxx/include/__cxx03/__memory/swap_allocator.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MEMORY_SWAP_ALLOCATOR_H #define _LIBCPP___MEMORY_SWAP_ALLOCATOR_H -#include <__config> -#include <__memory/allocator_traits.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_swappable.h> -#include <__utility/swap.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/temp_value.h b/libcxx/include/__cxx03/__memory/temp_value.h index 
4a133b3fbcf6c..ddf963da45de4 100644 --- a/libcxx/include/__cxx03/__memory/temp_value.h +++ b/libcxx/include/__cxx03/__memory/temp_value.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MEMORY_TEMP_VALUE_H #define _LIBCPP___MEMORY_TEMP_VALUE_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/allocator_traits.h> -#include <__type_traits/aligned_storage.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__type_traits/aligned_storage.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/temporary_buffer.h b/libcxx/include/__cxx03/__memory/temporary_buffer.h index 88799ca95c1f3..11a22e6db67d9 100644 --- a/libcxx/include/__cxx03/__memory/temporary_buffer.h +++ b/libcxx/include/__cxx03/__memory/temporary_buffer.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___MEMORY_TEMPORARY_BUFFER_H #define _LIBCPP___MEMORY_TEMPORARY_BUFFER_H -#include <__config> -#include <__utility/pair.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h b/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h index 7475ef5cf85de..d74304d1d970a 100644 --- a/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__cxx03/__memory/uninitialized_algorithms.h @@ -10,39 +10,39 @@ #ifndef _LIBCPP___MEMORY_UNINITIALIZED_ALGORITHMS_H #define _LIBCPP___MEMORY_UNINITIALIZED_ALGORITHMS_H -#include <__algorithm/copy.h> -#include <__algorithm/move.h> -#include <__algorithm/unwrap_iter.h> -#include <__algorithm/unwrap_range.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include 
<__iterator/reverse_iterator.h> -#include <__memory/addressof.h> -#include <__memory/allocator_traits.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__memory/voidify.h> -#include <__type_traits/extent.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/is_unbounded_array.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_extent.h> -#include <__utility/exception_guard.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__algorithm/unwrap_range.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/voidify.h> +#include <__cxx03/__type_traits/extent.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/is_unbounded_array.h> +#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_extent.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/new> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/unique_ptr.h b/libcxx/include/__cxx03/__memory/unique_ptr.h index f75259473efb1..2576b6b37e7cb 100644 --- a/libcxx/include/__cxx03/__memory/unique_ptr.h +++ b/libcxx/include/__cxx03/__memory/unique_ptr.h @@ -10,45 +10,45 @@ #ifndef _LIBCPP___MEMORY_UNIQUE_PTR_H #define _LIBCPP___MEMORY_UNIQUE_PTR_H -#include <__compare/compare_three_way.h> -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/operations.h> -#include <__memory/allocator_traits.h> // __pointer -#include <__memory/auto_ptr.h> -#include <__memory/compressed_pair.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/common_type.h> -#include <__type_traits/conditional.h> -#include <__type_traits/dependent_type.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/is_void.h> -#include <__type_traits/remove_extent.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/type_identity.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__compare/compare_three_way.h> +#include <__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include 
<__cxx03/__functional/hash.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__memory/allocator_traits.h> // __pointer +#include <__cxx03/__memory/auto_ptr.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/dependent_type.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/remove_extent.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/uses_allocator.h b/libcxx/include/__cxx03/__memory/uses_allocator.h index 84310c3fa5673..afdaac2d4b6fc 100644 --- a/libcxx/include/__cxx03/__memory/uses_allocator.h +++ b/libcxx/include/__cxx03/__memory/uses_allocator.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___MEMORY_USES_ALLOCATOR_H #define _LIBCPP___MEMORY_USES_ALLOCATOR_H -#include <__config> -#include <__type_traits/is_convertible.h> -#include +#include <__cxx03/__config> +#include 
<__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory/uses_allocator_construction.h b/libcxx/include/__cxx03/__memory/uses_allocator_construction.h index 5e5819d4c281e..fae2609204b13 100644 --- a/libcxx/include/__cxx03/__memory/uses_allocator_construction.h +++ b/libcxx/include/__cxx03/__memory/uses_allocator_construction.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___MEMORY_USES_ALLOCATOR_CONSTRUCTION_H #define _LIBCPP___MEMORY_USES_ALLOCATOR_CONSTRUCTION_H -#include <__config> -#include <__memory/construct_at.h> -#include <__memory/uses_allocator.h> -#include <__tuple/tuple_like_no_subrange.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> -#include <__utility/declval.h> -#include <__utility/pair.h> -#include <tuple> +#include <__cxx03/__config> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/uses_allocator.h> +#include <__cxx03/__tuple/tuple_like_no_subrange.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__memory/voidify.h b/libcxx/include/__cxx03/__memory/voidify.h index dbd083bd8c1e9..b509aecc4a18f 100644 --- a/libcxx/include/__cxx03/__memory/voidify.h +++ b/libcxx/include/__cxx03/__memory/voidify.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___MEMORY_VOIDIFY_H #define _LIBCPP___MEMORY_VOIDIFY_H -#include <__config> -#include <__memory/addressof.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory_resource/memory_resource.h b/libcxx/include/__cxx03/__memory_resource/memory_resource.h index ea85e50cd568b..e4af41a9dc0b0 100644 --- a/libcxx/include/__cxx03/__memory_resource/memory_resource.h +++ b/libcxx/include/__cxx03/__memory_resource/memory_resource.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H -#include <__config> -#include <__fwd/memory_resource.h> -#include <cstddef> +#include <__cxx03/__config> +#include <__cxx03/__fwd/memory_resource.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__cxx03/__memory_resource/monotonic_buffer_resource.h index f45b30fdb3861..9d418ea2289ae 100644 --- a/libcxx/include/__cxx03/__memory_resource/monotonic_buffer_resource.h +++ b/libcxx/include/__cxx03/__memory_resource/monotonic_buffer_resource.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory_resource/memory_resource.h> -#include <cstddef> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory_resource/memory_resource.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory_resource/polymorphic_allocator.h b/libcxx/include/__cxx03/__memory_resource/polymorphic_allocator.h index a71096d3e4784..f615ac3811671 100644 --- a/libcxx/include/__cxx03/__memory_resource/polymorphic_allocator.h +++ b/libcxx/include/__cxx03/__memory_resource/polymorphic_allocator.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_POLYMORPHIC_ALLOCATOR_H #define 
_LIBCPP___MEMORY_RESOURCE_POLYMORPHIC_ALLOCATOR_H -#include <__assert> -#include <__config> -#include <__fwd/pair.h> -#include <__memory_resource/memory_resource.h> -#include <__utility/exception_guard.h> -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__memory_resource/memory_resource.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/cstddef> +#include <__cxx03/limits> +#include <__cxx03/new> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__cxx03/__memory_resource/pool_options.h b/libcxx/include/__cxx03/__memory_resource/pool_options.h index 442959836c7ef..50f3ea5882981 100644 --- a/libcxx/include/__cxx03/__memory_resource/pool_options.h +++ b/libcxx/include/__cxx03/__memory_resource/pool_options.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_POOL_OPTIONS_H #define _LIBCPP___MEMORY_RESOURCE_POOL_OPTIONS_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__cxx03/__memory_resource/synchronized_pool_resource.h index 50a673c2861d1..f139b592eadad 100644 --- a/libcxx/include/__cxx03/__memory_resource/synchronized_pool_resource.h +++ b/libcxx/include/__cxx03/__memory_resource/synchronized_pool_resource.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H -#include <__config> -#include <__memory_resource/memory_resource.h> -#include <__memory_resource/pool_options.h> -#include <__memory_resource/unsynchronized_pool_resource.h> -#include -#include 
+#include <__cxx03/__config> +#include <__cxx03/__memory_resource/memory_resource.h> +#include <__cxx03/__memory_resource/pool_options.h> +#include <__cxx03/__memory_resource/unsynchronized_pool_resource.h> +#include <__cxx03/cstddef> +#include <__cxx03/mutex> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__cxx03/__memory_resource/unsynchronized_pool_resource.h index 783db84262af7..d3a32da069b7e 100644 --- a/libcxx/include/__cxx03/__memory_resource/unsynchronized_pool_resource.h +++ b/libcxx/include/__cxx03/__memory_resource/unsynchronized_pool_resource.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H #define _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H -#include <__config> -#include <__memory_resource/memory_resource.h> -#include <__memory_resource/pool_options.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory_resource/memory_resource.h> +#include <__cxx03/__memory_resource/pool_options.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__mutex/lock_guard.h b/libcxx/include/__cxx03/__mutex/lock_guard.h index ef56896be9f68..b20efc33b2a15 100644 --- a/libcxx/include/__cxx03/__mutex/lock_guard.h +++ b/libcxx/include/__cxx03/__mutex/lock_guard.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___MUTEX_LOCK_GUARD_H #define _LIBCPP___MUTEX_LOCK_GUARD_H -#include <__config> -#include <__mutex/tag_types.h> +#include <__cxx03/__config> +#include <__cxx03/__mutex/tag_types.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__mutex/mutex.h b/libcxx/include/__cxx03/__mutex/mutex.h index 1ed01547126f4..58adf2b1b2b0a 100644 --- a/libcxx/include/__cxx03/__mutex/mutex.h +++ 
b/libcxx/include/__cxx03/__mutex/mutex.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___MUTEX_MUTEX_H #define _LIBCPP___MUTEX_MUTEX_H -#include <__config> -#include <__thread/support.h> -#include <__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__mutex/once_flag.h b/libcxx/include/__cxx03/__mutex/once_flag.h index 9d7baecbc7085..9c467f7b274dd 100644 --- a/libcxx/include/__cxx03/__mutex/once_flag.h +++ b/libcxx/include/__cxx03/__mutex/once_flag.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___MUTEX_ONCE_FLAG_H #define _LIBCPP___MUTEX_ONCE_FLAG_H -#include <__config> -#include <__functional/invoke.h> -#include <__memory/shared_ptr.h> // __libcpp_acquire_load -#include <__tuple/tuple_indices.h> -#include <__tuple/tuple_size.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <cstdint> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__memory/shared_ptr.h> // __libcpp_acquire_load +#include <__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstdint> #ifndef _LIBCPP_CXX03_LANG -# include <tuple> +# include <__cxx03/tuple> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -26,7 +26,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__mutex/tag_types.h b/libcxx/include/__cxx03/__mutex/tag_types.h index 2b2dd58ee4e80..c9665f1dd3a7a 100644 --- a/libcxx/include/__cxx03/__mutex/tag_types.h +++ b/libcxx/include/__cxx03/__mutex/tag_types.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___MUTEX_TAG_TYPES_H #define _LIBCPP___MUTEX_TAG_TYPES_H -#include <__config> +#include <__cxx03/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__mutex/unique_lock.h b/libcxx/include/__cxx03/__mutex/unique_lock.h index 4a616ba51ee1c..8dd620d0d5a8d 100644 --- a/libcxx/include/__cxx03/__mutex/unique_lock.h +++ b/libcxx/include/__cxx03/__mutex/unique_lock.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___MUTEX_UNIQUE_LOCK_H #define _LIBCPP___MUTEX_UNIQUE_LOCK_H -#include <__chrono/duration.h> -#include <__chrono/time_point.h> -#include <__config> -#include <__memory/addressof.h> -#include <__mutex/tag_types.h> -#include <__system_error/system_error.h> -#include <__utility/swap.h> -#include <cerrno> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__mutex/tag_types.h> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cerrno> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__node_handle b/libcxx/include/__cxx03/__node_handle index d0b35bfd19340..6b3af1507c7b4 100644 --- a/libcxx/include/__cxx03/__node_handle +++ b/libcxx/include/__cxx03/__node_handle @@ -58,18 +58,18 @@ public: */ -#include <__assert> -#include <__config> -#include <__memory/allocator_traits.h> -#include <__memory/pointer_traits.h> -#include <optional> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/accumulate.h b/libcxx/include/__cxx03/__numeric/accumulate.h index fedc1c46dfd02..243a4b05b97a3 100644 --- a/libcxx/include/__cxx03/__numeric/accumulate.h +++ 
b/libcxx/include/__cxx03/__numeric/accumulate.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___NUMERIC_ACCUMULATE_H #define _LIBCPP___NUMERIC_ACCUMULATE_H -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/adjacent_difference.h b/libcxx/include/__cxx03/__numeric/adjacent_difference.h index 62b53342d9a41..a07d46a3892c1 100644 --- a/libcxx/include/__cxx03/__numeric/adjacent_difference.h +++ b/libcxx/include/__cxx03/__numeric/adjacent_difference.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___NUMERIC_ADJACENT_DIFFERENCE_H #define _LIBCPP___NUMERIC_ADJACENT_DIFFERENCE_H -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/exclusive_scan.h b/libcxx/include/__cxx03/__numeric/exclusive_scan.h index af85b477dfe68..70b85783ed22f 100644 --- a/libcxx/include/__cxx03/__numeric/exclusive_scan.h +++ b/libcxx/include/__cxx03/__numeric/exclusive_scan.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___NUMERIC_EXCLUSIVE_SCAN_H #define _LIBCPP___NUMERIC_EXCLUSIVE_SCAN_H -#include <__config> -#include <__functional/operations.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/gcd_lcm.h b/libcxx/include/__cxx03/__numeric/gcd_lcm.h index 9be6cf8516b13..4d1a88f23bf46 100644 --- a/libcxx/include/__cxx03/__numeric/gcd_lcm.h +++ b/libcxx/include/__cxx03/__numeric/gcd_lcm.h @@ -10,23 +10,23 @@ #ifndef _LIBCPP___NUMERIC_GCD_LCM_H #define _LIBCPP___NUMERIC_GCD_LCM_H -#include <__algorithm/min.h> -#include <__assert> -#include <__bit/countr.h> -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/make_unsigned.h> -#include <limits> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__bit/countr.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/inclusive_scan.h b/libcxx/include/__cxx03/__numeric/inclusive_scan.h index d714f102d74ef..352161cafd498 100644 --- a/libcxx/include/__cxx03/__numeric/inclusive_scan.h +++ b/libcxx/include/__cxx03/__numeric/inclusive_scan.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___NUMERIC_INCLUSIVE_SCAN_H #define _LIBCPP___NUMERIC_INCLUSIVE_SCAN_H -#include <__config> -#include <__functional/operations.h> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # 
pragma GCC system_header diff --git a/libcxx/include/__cxx03/__numeric/inner_product.h b/libcxx/include/__cxx03/__numeric/inner_product.h index 0deab3d421b77..a5369438c7e3f 100644 --- a/libcxx/include/__cxx03/__numeric/inner_product.h +++ b/libcxx/include/__cxx03/__numeric/inner_product.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___NUMERIC_INNER_PRODUCT_H #define _LIBCPP___NUMERIC_INNER_PRODUCT_H -#include <__config> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/iota.h b/libcxx/include/__cxx03/__numeric/iota.h index 27bd84e395a3a..fe9fb9fc37da4 100644 --- a/libcxx/include/__cxx03/__numeric/iota.h +++ b/libcxx/include/__cxx03/__numeric/iota.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___NUMERIC_IOTA_H #define _LIBCPP___NUMERIC_IOTA_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__numeric/midpoint.h b/libcxx/include/__cxx03/__numeric/midpoint.h index 5ef30d4ec50f5..2244354c79300 100644 --- a/libcxx/include/__cxx03/__numeric/midpoint.h +++ b/libcxx/include/__cxx03/__numeric/midpoint.h @@ -10,26 +10,26 @@ #ifndef _LIBCPP___NUMERIC_MIDPOINT_H #define _LIBCPP___NUMERIC_MIDPOINT_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_null_pointer.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_void.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_pointer.h> -#include -#include +#include <__cxx03/__config> +#include 
<__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_null_pointer.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/partial_sum.h b/libcxx/include/__cxx03/__numeric/partial_sum.h index 7c3cf7fb20b04..1f9aa36233dd0 100644 --- a/libcxx/include/__cxx03/__numeric/partial_sum.h +++ b/libcxx/include/__cxx03/__numeric/partial_sum.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___NUMERIC_PARTIAL_SUM_H #define _LIBCPP___NUMERIC_PARTIAL_SUM_H -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/pstl.h b/libcxx/include/__cxx03/__numeric/pstl.h index 7557686a3663d..365f6fabb1476 100644 --- a/libcxx/include/__cxx03/__numeric/pstl.h +++ b/libcxx/include/__cxx03/__numeric/pstl.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___NUMERIC_PSTL_H #define _LIBCPP___NUMERIC_PSTL_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 -# include <__functional/identity.h> -# include <__functional/operations.h> -# include <__iterator/cpp17_iterator_concepts.h> -# include <__iterator/iterator_traits.h> -# include <__pstl/backend.h> -# include <__pstl/dispatch.h> -# include <__pstl/handle_exception.h> -# include <__type_traits/enable_if.h> -# include <__type_traits/is_execution_policy.h> -# include <__type_traits/remove_cvref.h> -# include <__utility/forward.h> -# include <__utility/move.h> +# include <__cxx03/__functional/identity.h> +# include <__cxx03/__functional/operations.h> +# include <__cxx03/__iterator/cpp17_iterator_concepts.h> +# include <__cxx03/__iterator/iterator_traits.h> +# include <__cxx03/__pstl/backend.h> +# include <__cxx03/__pstl/dispatch.h> +# include <__cxx03/__pstl/handle_exception.h> +# include <__cxx03/__type_traits/enable_if.h> +# include <__cxx03/__type_traits/is_execution_policy.h> +# include <__cxx03/__type_traits/remove_cvref.h> +# include <__cxx03/__utility/forward.h> +# include <__cxx03/__utility/move.h> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/reduce.h b/libcxx/include/__cxx03/__numeric/reduce.h index 6c205bf581fb9..039954498043a 100644 --- a/libcxx/include/__cxx03/__numeric/reduce.h +++ b/libcxx/include/__cxx03/__numeric/reduce.h @@ -10,17 +10,17 @@ #ifndef _LIBCPP___NUMERIC_REDUCE_H #define _LIBCPP___NUMERIC_REDUCE_H -#include <__config> -#include <__functional/operations.h> -#include <__iterator/iterator_traits.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__numeric/saturation_arithmetic.h b/libcxx/include/__cxx03/__numeric/saturation_arithmetic.h index 2390b42aaec31..972428e36a778 100644 --- a/libcxx/include/__cxx03/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__cxx03/__numeric/saturation_arithmetic.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H #define _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H -#include <__assert> -#include <__concepts/arithmetic.h> -#include <__config> -#include <__utility/cmp.h> -#include <limits> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__utility/cmp.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__numeric/transform_exclusive_scan.h b/libcxx/include/__cxx03/__numeric/transform_exclusive_scan.h index fb88aa52cd105..5857c75eb2094 100644 --- a/libcxx/include/__cxx03/__numeric/transform_exclusive_scan.h +++ b/libcxx/include/__cxx03/__numeric/transform_exclusive_scan.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___NUMERIC_TRANSFORM_EXCLUSIVE_SCAN_H #define _LIBCPP___NUMERIC_TRANSFORM_EXCLUSIVE_SCAN_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__numeric/transform_inclusive_scan.h b/libcxx/include/__cxx03/__numeric/transform_inclusive_scan.h index 2eab1a142439a..31f7d52b4261f 100644 --- a/libcxx/include/__cxx03/__numeric/transform_inclusive_scan.h +++ b/libcxx/include/__cxx03/__numeric/transform_inclusive_scan.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___NUMERIC_TRANSFORM_INCLUSIVE_SCAN_H #define _LIBCPP___NUMERIC_TRANSFORM_INCLUSIVE_SCAN_H -#include <__config> -#include <__iterator/iterator_traits.h> +#include <__cxx03/__config> +#include 
<__cxx03/__iterator/iterator_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__numeric/transform_reduce.h b/libcxx/include/__cxx03/__numeric/transform_reduce.h index f1150510f0c36..5e494ff5d1b78 100644 --- a/libcxx/include/__cxx03/__numeric/transform_reduce.h +++ b/libcxx/include/__cxx03/__numeric/transform_reduce.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H #define _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H -#include <__config> -#include <__functional/operations.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ostream/basic_ostream.h b/libcxx/include/__cxx03/__ostream/basic_ostream.h index 178359d681567..1f0fb9acd3871 100644 --- a/libcxx/include/__cxx03/__ostream/basic_ostream.h +++ b/libcxx/include/__cxx03/__ostream/basic_ostream.h @@ -9,30 +9,30 @@ #ifndef _LIBCPP___OSTREAM_BASIC_OSTREAM_H #define _LIBCPP___OSTREAM_BASIC_OSTREAM_H -#include <__config> -#include <__exception/operations.h> -#include <__memory/shared_ptr.h> -#include <__memory/unique_ptr.h> -#include <__system_error/error_code.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_base_of.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include -#include -#include -#include -#include // for __throw_bad_alloc -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/operations.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__type_traits/conjunction.h> +#include 
<__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/bitset> +#include <__cxx03/cstddef> +#include <__cxx03/ios> +#include <__cxx03/locale> +#include <__cxx03/new> // for __throw_bad_alloc +#include <__cxx03/streambuf> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ostream/print.h b/libcxx/include/__cxx03/__ostream/print.h index 8265ac00777e2..b183081405184 100644 --- a/libcxx/include/__cxx03/__ostream/print.h +++ b/libcxx/include/__cxx03/__ostream/print.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___OSTREAM_PRINT_H #define _LIBCPP___OSTREAM_PRINT_H -#include <__config> -#include <__fwd/ostream.h> -#include <__iterator/ostreambuf_iterator.h> -#include <__ostream/basic_ostream.h> -#include -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__iterator/ostreambuf_iterator.h> +#include <__cxx03/__ostream/basic_ostream.h> +#include <__cxx03/format> +#include <__cxx03/ios> +#include <__cxx03/locale> +#include <__cxx03/print> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__pstl/backend.h b/libcxx/include/__cxx03/__pstl/backend.h index 86d9f28c77fa8..9bc243746df0d 100644 --- a/libcxx/include/__cxx03/__pstl/backend.h +++ b/libcxx/include/__cxx03/__pstl/backend.h @@ -9,25 +9,25 @@ #ifndef _LIBCPP___PSTL_BACKEND_H #define _LIBCPP___PSTL_BACKEND_H -#include <__config> -#include <__pstl/backend_fwd.h> +#include <__cxx03/__config> +#include <__cxx03/__pstl/backend_fwd.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> #if defined(_LIBCPP_PSTL_BACKEND_SERIAL) -# include <__pstl/backends/default.h> -# include <__pstl/backends/serial.h> +# include <__cxx03/__pstl/backends/default.h> +# include <__cxx03/__pstl/backends/serial.h> #elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) -# include <__pstl/backends/default.h> -# include <__pstl/backends/std_thread.h> +# include <__cxx03/__pstl/backends/default.h> +# include <__cxx03/__pstl/backends/std_thread.h> #elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) -# include <__pstl/backends/default.h> -# include <__pstl/backends/libdispatch.h> +# include <__cxx03/__pstl/backends/default.h> +# include <__cxx03/__pstl/backends/libdispatch.h> #endif _LIBCPP_POP_MACROS diff --git a/libcxx/include/__cxx03/__pstl/backend_fwd.h b/libcxx/include/__cxx03/__pstl/backend_fwd.h index 32c5da576fb3c..56c8c043e4d2c 100644 --- a/libcxx/include/__cxx03/__pstl/backend_fwd.h +++ b/libcxx/include/__cxx03/__pstl/backend_fwd.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___PSTL_BACKEND_FWD_H #define _LIBCPP___PSTL_BACKEND_FWD_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> // // This header declares available PSTL backends and the functions that must be implemented in order for the diff --git a/libcxx/include/__cxx03/__pstl/backends/default.h b/libcxx/include/__cxx03/__pstl/backends/default.h index 61a128805f854..2ad388ae50d33 100644 --- a/libcxx/include/__cxx03/__pstl/backends/default.h +++ b/libcxx/include/__cxx03/__pstl/backends/default.h @@ -9,29 +9,29 @@ #ifndef _LIBCPP___PSTL_BACKENDS_DEFAULT_H #define _LIBCPP___PSTL_BACKENDS_DEFAULT_H -#include <__algorithm/copy_n.h> -#include <__algorithm/equal.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/for_each_n.h> -#include <__config> -#include <__functional/identity.h> -#include <__functional/not_fn.h> -#include 
<__functional/operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/dispatch.h> -#include <__utility/empty.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/for_each_n.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__functional/not_fn.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/dispatch.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/backends/libdispatch.h b/libcxx/include/__cxx03/__pstl/backends/libdispatch.h index a0c3ad980ed1b..fb811917025a2 100644 --- a/libcxx/include/__cxx03/__pstl/backends/libdispatch.h +++ b/libcxx/include/__cxx03/__pstl/backends/libdispatch.h @@ -9,40 +9,40 @@ #ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H #define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H -#include <__algorithm/inplace_merge.h> -#include <__algorithm/lower_bound.h> -#include <__algorithm/max.h> -#include <__algorithm/merge.h> -#include <__algorithm/upper_bound.h> -#include <__atomic/atomic.h> -#include <__config> -#include <__exception/terminate.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_iterator.h> -#include <__memory/allocator.h> -#include <__memory/construct_at.h> -#include <__memory/unique_ptr.h> -#include <__numeric/reduce.h> 
-#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/any_of.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__pstl/cpu_algos/fill.h> -#include <__pstl/cpu_algos/find_if.h> -#include <__pstl/cpu_algos/for_each.h> -#include <__pstl/cpu_algos/merge.h> -#include <__pstl/cpu_algos/stable_sort.h> -#include <__pstl/cpu_algos/transform.h> -#include <__pstl/cpu_algos/transform_reduce.h> -#include <__utility/empty.h> -#include <__utility/exception_guard.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include -#include -#include +#include <__cxx03/__algorithm/inplace_merge.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/merge.h> +#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__atomic/atomic.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/terminate.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__numeric/reduce.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/any_of.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__pstl/cpu_algos/fill.h> +#include <__cxx03/__pstl/cpu_algos/find_if.h> +#include <__cxx03/__pstl/cpu_algos/for_each.h> +#include <__cxx03/__pstl/cpu_algos/merge.h> +#include <__cxx03/__pstl/cpu_algos/stable_sort.h> +#include <__cxx03/__pstl/cpu_algos/transform.h> +#include <__cxx03/__pstl/cpu_algos/transform_reduce.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> +#include <__cxx03/optional> _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff 
--git a/libcxx/include/__cxx03/__pstl/backends/serial.h b/libcxx/include/__cxx03/__pstl/backends/serial.h index 5f24499899bd2..699c1035d9113 100644 --- a/libcxx/include/__cxx03/__pstl/backends/serial.h +++ b/libcxx/include/__cxx03/__pstl/backends/serial.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___PSTL_BACKENDS_SERIAL_H #define _LIBCPP___PSTL_BACKENDS_SERIAL_H -#include <__algorithm/find_if.h> -#include <__algorithm/for_each.h> -#include <__algorithm/merge.h> -#include <__algorithm/stable_sort.h> -#include <__algorithm/transform.h> -#include <__config> -#include <__numeric/transform_reduce.h> -#include <__pstl/backend_fwd.h> -#include <__utility/empty.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/find_if.h> +#include <__cxx03/__algorithm/for_each.h> +#include <__cxx03/__algorithm/merge.h> +#include <__cxx03/__algorithm/stable_sort.h> +#include <__cxx03/__algorithm/transform.h> +#include <__cxx03/__config> +#include <__cxx03/__numeric/transform_reduce.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/backends/std_thread.h b/libcxx/include/__cxx03/__pstl/backends/std_thread.h index 49570bd30b082..06cd1a1c03c46 100644 --- a/libcxx/include/__cxx03/__pstl/backends/std_thread.h +++ b/libcxx/include/__cxx03/__pstl/backends/std_thread.h @@ -9,28 +9,28 @@ #ifndef _LIBCPP___PSTL_BACKENDS_STD_THREAD_H #define _LIBCPP___PSTL_BACKENDS_STD_THREAD_H -#include <__config> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/any_of.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__pstl/cpu_algos/fill.h> -#include 
<__pstl/cpu_algos/find_if.h> -#include <__pstl/cpu_algos/for_each.h> -#include <__pstl/cpu_algos/merge.h> -#include <__pstl/cpu_algos/stable_sort.h> -#include <__pstl/cpu_algos/transform.h> -#include <__pstl/cpu_algos/transform_reduce.h> -#include <__utility/empty.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/any_of.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__pstl/cpu_algos/fill.h> +#include <__cxx03/__pstl/cpu_algos/find_if.h> +#include <__cxx03/__pstl/cpu_algos/for_each.h> +#include <__cxx03/__pstl/cpu_algos/merge.h> +#include <__cxx03/__pstl/cpu_algos/stable_sort.h> +#include <__cxx03/__pstl/cpu_algos/transform.h> +#include <__cxx03/__pstl/cpu_algos/transform_reduce.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/any_of.h b/libcxx/include/__cxx03/__pstl/cpu_algos/any_of.h index b33c787a29db2..b1b51d1679bf2 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/any_of.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/any_of.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H #define _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H -#include <__algorithm/any_of.h> -#include <__assert> -#include <__atomic/atomic.h> -#include <__atomic/memory_order.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include -#include +#include <__cxx03/__algorithm/any_of.h> +#include <__cxx03/__assert> +#include 
<__cxx03/__atomic/atomic.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/cstdint> +#include <__cxx03/optional> _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__cxx03/__pstl/cpu_algos/cpu_traits.h index 0483d6918fd01..9538b5c94957d 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/cpu_traits.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H #define _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/fill.h b/libcxx/include/__cxx03/__pstl/cpu_algos/fill.h index 4e6d29b30cc69..19fac90c44010 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/fill.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/fill.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_FILL_H #define _LIBCPP___PSTL_CPU_ALGOS_FILL_H -#include <__algorithm/fill.h> -#include <__assert> -#include <__config> -#include <__iterator/concepts.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/empty.h> -#include +#include <__cxx03/__algorithm/fill.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include 
<__cxx03/__iterator/concepts.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/find_if.h b/libcxx/include/__cxx03/__pstl/cpu_algos/find_if.h index 12b2e88971df7..2be1aad7ae6be 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/find_if.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/find_if.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H #define _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H -#include <__algorithm/find_if.h> -#include <__assert> -#include <__atomic/atomic.h> -#include <__config> -#include <__functional/operations.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include -#include +#include <__cxx03/__algorithm/find_if.h> +#include <__cxx03/__assert> +#include <__cxx03/__atomic/atomic.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/cstddef> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/for_each.h b/libcxx/include/__cxx03/__pstl/cpu_algos/for_each.h 
index d4d7862135ff9..4f6f2e87342a9 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/for_each.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/for_each.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H #define _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H -#include <__algorithm/for_each.h> -#include <__assert> -#include <__config> -#include <__iterator/concepts.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/empty.h> -#include +#include <__cxx03/__algorithm/for_each.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/merge.h b/libcxx/include/__cxx03/__pstl/cpu_algos/merge.h index dfa4cbf69b147..0feb510dc60f7 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/merge.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/merge.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_MERGE_H #define _LIBCPP___PSTL_CPU_ALGOS_MERGE_H -#include <__algorithm/merge.h> -#include <__assert> -#include <__config> -#include <__iterator/concepts.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/merge.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/optional> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/stable_sort.h b/libcxx/include/__cxx03/__pstl/cpu_algos/stable_sort.h index 8e64f3e537c07..63be0c7ac1669 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/stable_sort.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/stable_sort.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H #define _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H -#include <__algorithm/stable_sort.h> -#include <__config> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/empty.h> -#include +#include <__cxx03/__algorithm/stable_sort.h> +#include <__cxx03/__config> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/transform.h b/libcxx/include/__cxx03/__pstl/cpu_algos/transform.h index 27ce8e27b242a..d82bbb588a998 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/transform.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/transform.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H #define _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H -#include <__algorithm/transform.h> -#include <__assert> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/transform.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> 
+#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__cxx03/__pstl/cpu_algos/transform_reduce.h index 36ac1a9072a89..52f0ea361b3c1 100644 --- a/libcxx/include/__cxx03/__pstl/cpu_algos/transform_reduce.h +++ b/libcxx/include/__cxx03/__pstl/cpu_algos/transform_reduce.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H #define _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H -#include <__assert> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__numeric/transform_reduce.h> -#include <__pstl/backend_fwd.h> -#include <__pstl/cpu_algos/cpu_traits.h> -#include <__type_traits/desugars_to.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_execution_policy.h> -#include <__utility/move.h> -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__numeric/transform_reduce.h> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__pstl/cpu_algos/cpu_traits.h> +#include <__cxx03/__type_traits/desugars_to.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/dispatch.h b/libcxx/include/__cxx03/__pstl/dispatch.h index 5e903f7524fe9..e223e59bb129c 100644 --- a/libcxx/include/__cxx03/__pstl/dispatch.h +++ b/libcxx/include/__cxx03/__pstl/dispatch.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___PSTL_DISPATCH_H #define _LIBCPP___PSTL_DISPATCH_H -#include <__config> -#include <__pstl/backend_fwd.h> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/type_identity.h> +#include <__cxx03/__config> +#include <__cxx03/__pstl/backend_fwd.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/type_identity.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__pstl/handle_exception.h b/libcxx/include/__cxx03/__pstl/handle_exception.h index d6270958c3a7c..562617a1786b3 100644 --- a/libcxx/include/__cxx03/__pstl/handle_exception.h +++ b/libcxx/include/__cxx03/__pstl/handle_exception.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___PSTL_HANDLE_EXCEPTION_H #define _LIBCPP___PSTL_HANDLE_EXCEPTION_H -#include <__config> -#include <__utility/forward.h> -#include <__utility/move.h> -#include // __throw_bad_alloc -#include +#include <__cxx03/__config> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/new> // __throw_bad_alloc +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { diff --git a/libcxx/include/__cxx03/__random/bernoulli_distribution.h b/libcxx/include/__cxx03/__random/bernoulli_distribution.h index 4f33dca132d10..28536442fd1d3 100644 --- a/libcxx/include/__cxx03/__random/bernoulli_distribution.h +++ b/libcxx/include/__cxx03/__random/bernoulli_distribution.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___RANDOM_BERNOULLI_DISTRIBUTION_H #define _LIBCPP___RANDOM_BERNOULLI_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/iosfwd> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/binomial_distribution.h b/libcxx/include/__cxx03/__random/binomial_distribution.h index e8774bb8d67ee..3ca98e84082e1 100644 --- a/libcxx/include/__cxx03/__random/binomial_distribution.h +++ b/libcxx/include/__cxx03/__random/binomial_distribution.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_BINOMIAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_BINOMIAL_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/cauchy_distribution.h b/libcxx/include/__cxx03/__random/cauchy_distribution.h index 
bd341427a1523..b84ba03703143 100644 --- a/libcxx/include/__cxx03/__random/cauchy_distribution.h +++ b/libcxx/include/__cxx03/__random/cauchy_distribution.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_CAUCHY_DISTRIBUTION_H #define _LIBCPP___RANDOM_CAUCHY_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/chi_squared_distribution.h b/libcxx/include/__cxx03/__random/chi_squared_distribution.h index efa96dcdaafb5..94ba5555ce41d 100644 --- a/libcxx/include/__cxx03/__random/chi_squared_distribution.h +++ b/libcxx/include/__cxx03/__random/chi_squared_distribution.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_CHI_SQUARED_DISTRIBUTION_H #define _LIBCPP___RANDOM_CHI_SQUARED_DISTRIBUTION_H -#include <__config> -#include <__random/gamma_distribution.h> -#include <__random/is_valid.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/gamma_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/clamp_to_integral.h b/libcxx/include/__cxx03/__random/clamp_to_integral.h index d9bfd31b7f012..c10783aab2c07 100644 --- a/libcxx/include/__cxx03/__random/clamp_to_integral.h +++ b/libcxx/include/__cxx03/__random/clamp_to_integral.h @@ -9,16 
+9,16 @@ #ifndef _LIBCPP___RANDOM_CLAMP_TO_INTEGRAL_H #define _LIBCPP___RANDOM_CLAMP_TO_INTEGRAL_H -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cmath> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/default_random_engine.h b/libcxx/include/__cxx03/__random/default_random_engine.h index 89792f4f0d43e..10eb00b3594ff 100644 --- a/libcxx/include/__cxx03/__random/default_random_engine.h +++ b/libcxx/include/__cxx03/__random/default_random_engine.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___RANDOM_DEFAULT_RANDOM_ENGINE_H #define _LIBCPP___RANDOM_DEFAULT_RANDOM_ENGINE_H -#include <__config> -#include <__random/linear_congruential_engine.h> +#include <__cxx03/__config> +#include <__cxx03/__random/linear_congruential_engine.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__random/discard_block_engine.h b/libcxx/include/__cxx03/__random/discard_block_engine.h index 07f599067279e..2a5f877b79caa 100644 --- a/libcxx/include/__cxx03/__random/discard_block_engine.h +++ b/libcxx/include/__cxx03/__random/discard_block_engine.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___RANDOM_DISCARD_BLOCK_ENGINE_H #define _LIBCPP___RANDOM_DISCARD_BLOCK_ENGINE_H -#include <__config> -#include <__random/is_seed_sequence.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include <__utility/move.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/discrete_distribution.h b/libcxx/include/__cxx03/__random/discrete_distribution.h index bb72dd6cb5074..40b8bfb61dccf 100644 --- a/libcxx/include/__cxx03/__random/discrete_distribution.h +++ b/libcxx/include/__cxx03/__random/discrete_distribution.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___RANDOM_DISCRETE_DISTRIBUTION_H #define _LIBCPP___RANDOM_DISCRETE_DISTRIBUTION_H -#include <__algorithm/upper_bound.h> -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include -#include +#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cstddef> +#include <__cxx03/iosfwd> +#include <__cxx03/numeric> +#include <__cxx03/vector> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/exponential_distribution.h b/libcxx/include/__cxx03/__random/exponential_distribution.h index e0e38841172f9..6f49c31a1bd10 100644 --- a/libcxx/include/__cxx03/__random/exponential_distribution.h +++ b/libcxx/include/__cxx03/__random/exponential_distribution.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_EXPONENTIAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_EXPONENTIAL_DISTRIBUTION_H -#include <__config> -#include <__random/generate_canonical.h> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/generate_canonical.h> +#include <__cxx03/__random/is_valid.h> +#include 
<__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/extreme_value_distribution.h b/libcxx/include/__cxx03/__random/extreme_value_distribution.h index 5505f93274f5c..46c466d620d50 100644 --- a/libcxx/include/__cxx03/__random/extreme_value_distribution.h +++ b/libcxx/include/__cxx03/__random/extreme_value_distribution.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_EXTREME_VALUE_DISTRIBUTION_H #define _LIBCPP___RANDOM_EXTREME_VALUE_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/fisher_f_distribution.h b/libcxx/include/__cxx03/__random/fisher_f_distribution.h index cd170b3af388f..45ab47df4021b 100644 --- a/libcxx/include/__cxx03/__random/fisher_f_distribution.h +++ b/libcxx/include/__cxx03/__random/fisher_f_distribution.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_FISHER_F_DISTRIBUTION_H #define _LIBCPP___RANDOM_FISHER_F_DISTRIBUTION_H -#include <__config> -#include <__random/gamma_distribution.h> -#include <__random/is_valid.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/gamma_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/gamma_distribution.h b/libcxx/include/__cxx03/__random/gamma_distribution.h index 986e42c1c7f5b..694c7219452a8 100644 --- a/libcxx/include/__cxx03/__random/gamma_distribution.h +++ b/libcxx/include/__cxx03/__random/gamma_distribution.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_GAMMA_DISTRIBUTION_H #define _LIBCPP___RANDOM_GAMMA_DISTRIBUTION_H -#include <__config> -#include <__random/exponential_distribution.h> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/exponential_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/generate_canonical.h b/libcxx/include/__cxx03/__random/generate_canonical.h index 738de1517e286..363f118c4f7d6 100644 --- a/libcxx/include/__cxx03/__random/generate_canonical.h +++ b/libcxx/include/__cxx03/__random/generate_canonical.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_GENERATE_CANONICAL_H #define _LIBCPP___RANDOM_GENERATE_CANONICAL_H -#include <__config> -#include <__random/log2.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/log2.h> +#include <__cxx03/cstdint> +#include <__cxx03/initializer_list> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/geometric_distribution.h b/libcxx/include/__cxx03/__random/geometric_distribution.h index cecd7e57cfa64..3fe06792da856 100644 --- a/libcxx/include/__cxx03/__random/geometric_distribution.h +++ b/libcxx/include/__cxx03/__random/geometric_distribution.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_GEOMETRIC_DISTRIBUTION_H #define _LIBCPP___RANDOM_GEOMETRIC_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/negative_binomial_distribution.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/negative_binomial_distribution.h> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/independent_bits_engine.h b/libcxx/include/__cxx03/__random/independent_bits_engine.h index 0f4a7b82b98f8..bf2350e7b0d6f 100644 --- a/libcxx/include/__cxx03/__random/independent_bits_engine.h +++ b/libcxx/include/__cxx03/__random/independent_bits_engine.h @@ -9,24 +9,24 @@ #ifndef _LIBCPP___RANDOM_INDEPENDENT_BITS_ENGINE_H #define _LIBCPP___RANDOM_INDEPENDENT_BITS_ENGINE_H -#include <__config> -#include <__fwd/istream.h> -#include <__fwd/ostream.h> -#include <__random/is_seed_sequence.h> -#include <__random/log2.h> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/istream.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__random/log2.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include 
<__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/is_seed_sequence.h b/libcxx/include/__cxx03/__random/is_seed_sequence.h index c7171cff2eda0..a924857c7ffa7 100644 --- a/libcxx/include/__cxx03/__random/is_seed_sequence.h +++ b/libcxx/include/__cxx03/__random/is_seed_sequence.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___RANDOM_IS_SEED_SEQUENCE_H #define _LIBCPP___RANDOM_IS_SEED_SEQUENCE_H -#include <__config> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__random/is_valid.h b/libcxx/include/__cxx03/__random/is_valid.h index a3e0f143ae86a..fc32a22bf608c 100644 --- a/libcxx/include/__cxx03/__random/is_valid.h +++ b/libcxx/include/__cxx03/__random/is_valid.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___RANDOM_IS_VALID_H #define _LIBCPP___RANDOM_IS_VALID_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_unsigned.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header diff --git a/libcxx/include/__cxx03/__random/knuth_b.h b/libcxx/include/__cxx03/__random/knuth_b.h index f5b31cb64fa4a..b8df0078ebb1a 100644 --- a/libcxx/include/__cxx03/__random/knuth_b.h +++ b/libcxx/include/__cxx03/__random/knuth_b.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___RANDOM_KNUTH_B_H #define _LIBCPP___RANDOM_KNUTH_B_H -#include <__config> -#include <__random/linear_congruential_engine.h> -#include <__random/shuffle_order_engine.h> +#include <__cxx03/__config> +#include <__cxx03/__random/linear_congruential_engine.h> +#include <__cxx03/__random/shuffle_order_engine.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__random/linear_congruential_engine.h b/libcxx/include/__cxx03/__random/linear_congruential_engine.h index 9d77649e9cfc8..94d140ae1a2d0 100644 --- a/libcxx/include/__cxx03/__random/linear_congruential_engine.h +++ b/libcxx/include/__cxx03/__random/linear_congruential_engine.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_LINEAR_CONGRUENTIAL_ENGINE_H #define _LIBCPP___RANDOM_LINEAR_CONGRUENTIAL_ENGINE_H -#include <__config> -#include <__random/is_seed_sequence.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_unsigned.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/cstdint> +#include <__cxx03/iosfwd> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/log2.h b/libcxx/include/__cxx03/__random/log2.h index 74b4889c6402b..5dbaace22d86e 100644 --- a/libcxx/include/__cxx03/__random/log2.h +++ 
b/libcxx/include/__cxx03/__random/log2.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___RANDOM_LOG2_H #define _LIBCPP___RANDOM_LOG2_H -#include <__config> -#include <__type_traits/conditional.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__random/lognormal_distribution.h b/libcxx/include/__cxx03/__random/lognormal_distribution.h index d8724f8bc5cec..f1e537a21eb65 100644 --- a/libcxx/include/__cxx03/__random/lognormal_distribution.h +++ b/libcxx/include/__cxx03/__random/lognormal_distribution.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_LOGNORMAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_LOGNORMAL_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> -#include <__random/normal_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/normal_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/mersenne_twister_engine.h b/libcxx/include/__cxx03/__random/mersenne_twister_engine.h index 65280d7c5505f..16deb8189dc46 100644 --- a/libcxx/include/__cxx03/__random/mersenne_twister_engine.h +++ b/libcxx/include/__cxx03/__random/mersenne_twister_engine.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___RANDOM_MERSENNE_TWISTER_ENGINE_H #define _LIBCPP___RANDOM_MERSENNE_TWISTER_ENGINE_H -#include <__algorithm/equal.h> -#include <__algorithm/min.h> -#include <__config> -#include <__random/is_seed_sequence.h> -#include -#include -#include -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> 
+#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/negative_binomial_distribution.h b/libcxx/include/__cxx03/__random/negative_binomial_distribution.h index 6d0055d01ed43..6a99c87f5296b 100644 --- a/libcxx/include/__cxx03/__random/negative_binomial_distribution.h +++ b/libcxx/include/__cxx03/__random/negative_binomial_distribution.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___RANDOM_NEGATIVE_BINOMIAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_NEGATIVE_BINOMIAL_DISTRIBUTION_H -#include <__assert> -#include <__config> -#include <__random/bernoulli_distribution.h> -#include <__random/gamma_distribution.h> -#include <__random/is_valid.h> -#include <__random/poisson_distribution.h> -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__random/bernoulli_distribution.h> +#include <__cxx03/__random/gamma_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/poisson_distribution.h> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/normal_distribution.h b/libcxx/include/__cxx03/__random/normal_distribution.h index 889f189e4161b..95e4f5da49eb4 100644 --- a/libcxx/include/__cxx03/__random/normal_distribution.h +++ b/libcxx/include/__cxx03/__random/normal_distribution.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_NORMAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_NORMAL_DISTRIBUTION_H -#include <__config> -#include <__random/is_valid.h> 
-#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/piecewise_constant_distribution.h b/libcxx/include/__cxx03/__random/piecewise_constant_distribution.h index e19380f97c35e..5bd53b81cbfce 100644 --- a/libcxx/include/__cxx03/__random/piecewise_constant_distribution.h +++ b/libcxx/include/__cxx03/__random/piecewise_constant_distribution.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H -#include <__algorithm/upper_bound.h> -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/iosfwd> +#include <__cxx03/numeric> +#include <__cxx03/vector> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/piecewise_linear_distribution.h b/libcxx/include/__cxx03/__random/piecewise_linear_distribution.h index 43769dc825e65..faf845321fec8 100644 --- a/libcxx/include/__cxx03/__random/piecewise_linear_distribution.h +++ b/libcxx/include/__cxx03/__random/piecewise_linear_distribution.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H #define 
_LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H -#include <__algorithm/upper_bound.h> -#include <__config> -#include <__random/is_valid.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__algorithm/upper_bound.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/vector> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/poisson_distribution.h b/libcxx/include/__cxx03/__random/poisson_distribution.h index 61a092ef9dd4d..7fb4b9681d70d 100644 --- a/libcxx/include/__cxx03/__random/poisson_distribution.h +++ b/libcxx/include/__cxx03/__random/poisson_distribution.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___RANDOM_POISSON_DISTRIBUTION_H #define _LIBCPP___RANDOM_POISSON_DISTRIBUTION_H -#include <__config> -#include <__random/clamp_to_integral.h> -#include <__random/exponential_distribution.h> -#include <__random/is_valid.h> -#include <__random/normal_distribution.h> -#include <__random/uniform_real_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/clamp_to_integral.h> +#include <__cxx03/__random/exponential_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/normal_distribution.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/random_device.h b/libcxx/include/__cxx03/__random/random_device.h 
index 52407943d2ec7..2f95979bf5491 100644 --- a/libcxx/include/__cxx03/__random/random_device.h +++ b/libcxx/include/__cxx03/__random/random_device.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___RANDOM_RANDOM_DEVICE_H #define _LIBCPP___RANDOM_RANDOM_DEVICE_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/ranlux.h b/libcxx/include/__cxx03/__random/ranlux.h index 952afde91b109..e99773cae392f 100644 --- a/libcxx/include/__cxx03/__random/ranlux.h +++ b/libcxx/include/__cxx03/__random/ranlux.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___RANDOM_RANLUX_H #define _LIBCPP___RANDOM_RANLUX_H -#include <__config> -#include <__random/discard_block_engine.h> -#include <__random/subtract_with_carry_engine.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__random/discard_block_engine.h> +#include <__cxx03/__random/subtract_with_carry_engine.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__random/seed_seq.h b/libcxx/include/__cxx03/__random/seed_seq.h index 5cf84aeb8a72b..8ccd656b47e88 100644 --- a/libcxx/include/__cxx03/__random/seed_seq.h +++ b/libcxx/include/__cxx03/__random/seed_seq.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___RANDOM_SEED_SEQ_H #define _LIBCPP___RANDOM_SEED_SEQ_H -#include <__algorithm/copy.h> -#include <__algorithm/fill.h> -#include <__algorithm/max.h> -#include <__config> -#include <__iterator/iterator_traits.h> -#include <__type_traits/is_unsigned.h> -#include -#include -#include +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/fill.h> +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/iterator_traits.h> +#include 
<__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/cstdint> +#include <__cxx03/initializer_list> +#include <__cxx03/vector> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/shuffle_order_engine.h b/libcxx/include/__cxx03/__random/shuffle_order_engine.h index f54ed17e38383..8eb57c9ec56db 100644 --- a/libcxx/include/__cxx03/__random/shuffle_order_engine.h +++ b/libcxx/include/__cxx03/__random/shuffle_order_engine.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___RANDOM_SHUFFLE_ORDER_ENGINE_H #define _LIBCPP___RANDOM_SHUFFLE_ORDER_ENGINE_H -#include <__algorithm/equal.h> -#include <__config> -#include <__random/is_seed_sequence.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_convertible.h> -#include <__utility/move.h> -#include -#include -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/iosfwd> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/student_t_distribution.h b/libcxx/include/__cxx03/__random/student_t_distribution.h index 110a856ee6586..32a1c3c7381ff 100644 --- a/libcxx/include/__cxx03/__random/student_t_distribution.h +++ b/libcxx/include/__cxx03/__random/student_t_distribution.h @@ -9,20 +9,20 @@ #ifndef _LIBCPP___RANDOM_STUDENT_T_DISTRIBUTION_H #define 
_LIBCPP___RANDOM_STUDENT_T_DISTRIBUTION_H -#include <__config> -#include <__random/gamma_distribution.h> -#include <__random/is_valid.h> -#include <__random/normal_distribution.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/gamma_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/normal_distribution.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h b/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h index ec25fed49f949..c539994be1c80 100644 --- a/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h +++ b/libcxx/include/__cxx03/__random/subtract_with_carry_engine.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___RANDOM_SUBTRACT_WITH_CARRY_ENGINE_H #define _LIBCPP___RANDOM_SUBTRACT_WITH_CARRY_ENGINE_H -#include <__algorithm/equal.h> -#include <__algorithm/min.h> -#include <__config> -#include <__random/is_seed_sequence.h> -#include <__random/linear_congruential_engine.h> -#include -#include -#include -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__random/linear_congruential_engine.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/uniform_int_distribution.h b/libcxx/include/__cxx03/__random/uniform_int_distribution.h index 4e3ca3efe5686..8e509fad15774 100644 
--- a/libcxx/include/__cxx03/__random/uniform_int_distribution.h +++ b/libcxx/include/__cxx03/__random/uniform_int_distribution.h @@ -9,23 +9,23 @@ #ifndef _LIBCPP___RANDOM_UNIFORM_INT_DISTRIBUTION_H #define _LIBCPP___RANDOM_UNIFORM_INT_DISTRIBUTION_H -#include <__bit/countl.h> -#include <__config> -#include <__random/is_valid.h> -#include <__random/log2.h> -#include <__type_traits/conditional.h> -#include <__type_traits/make_unsigned.h> -#include -#include -#include -#include +#include <__cxx03/__bit/countl.h> +#include <__cxx03/__config> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/__random/log2.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/uniform_random_bit_generator.h b/libcxx/include/__cxx03/__random/uniform_random_bit_generator.h index 4076f19b2cb2c..74bbea38bf207 100644 --- a/libcxx/include/__cxx03/__random/uniform_random_bit_generator.h +++ b/libcxx/include/__cxx03/__random/uniform_random_bit_generator.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_UNIFORM_RANDOM_BIT_GENERATOR_H #define _LIBCPP___RANDOM_UNIFORM_RANDOM_BIT_GENERATOR_H -#include <__concepts/arithmetic.h> -#include <__concepts/invocable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/invoke.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/uniform_real_distribution.h b/libcxx/include/__cxx03/__random/uniform_real_distribution.h index 250cb8bab58cf..64c978906a4ce 100644 --- a/libcxx/include/__cxx03/__random/uniform_real_distribution.h +++ b/libcxx/include/__cxx03/__random/uniform_real_distribution.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___RANDOM_UNIFORM_REAL_DISTRIBUTION_H #define _LIBCPP___RANDOM_UNIFORM_REAL_DISTRIBUTION_H -#include <__config> -#include <__random/generate_canonical.h> -#include <__random/is_valid.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/generate_canonical.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__random/weibull_distribution.h b/libcxx/include/__cxx03/__random/weibull_distribution.h index aa3d63c8e8663..6c5937aaad9e7 100644 --- a/libcxx/include/__cxx03/__random/weibull_distribution.h +++ b/libcxx/include/__cxx03/__random/weibull_distribution.h @@ -9,19 +9,19 @@ #ifndef _LIBCPP___RANDOM_WEIBULL_DISTRIBUTION_H #define _LIBCPP___RANDOM_WEIBULL_DISTRIBUTION_H -#include <__config> -#include <__random/exponential_distribution.h> -#include <__random/is_valid.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__random/exponential_distribution.h> +#include <__cxx03/__random/is_valid.h> +#include <__cxx03/cmath> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git 
a/libcxx/include/__cxx03/__ranges/access.h b/libcxx/include/__cxx03/__ranges/access.h index c0a40c5e10178..a6a0470afeacd 100644 --- a/libcxx/include/__cxx03/__ranges/access.h +++ b/libcxx/include/__cxx03/__ranges/access.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___RANGES_ACCESS_H #define _LIBCPP___RANGES_ACCESS_H -#include <__concepts/class_or_enum.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/readable_traits.h> -#include <__ranges/enable_borrowed_range.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/auto_cast.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/all.h b/libcxx/include/__cxx03/__ranges/all.h index 023cee6caa9a2..0cb834fc41a35 100644 --- a/libcxx/include/__cxx03/__ranges/all.h +++ b/libcxx/include/__cxx03/__ranges/all.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___RANGES_ALL_H #define _LIBCPP___RANGES_ALL_H -#include <__config> -#include <__functional/compose.h> // TODO(modules): Those should not be required -#include <__functional/perfect_forward.h> // -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/owning_view.h> -#include <__ranges/range_adaptor.h> -#include 
<__ranges/ref_view.h> -#include <__type_traits/decay.h> -#include <__utility/auto_cast.h> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/compose.h> // TODO(modules): Those should not be required +#include <__cxx03/__functional/perfect_forward.h> // +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/owning_view.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/ref_view.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/as_rvalue_view.h b/libcxx/include/__cxx03/__ranges/as_rvalue_view.h index 5849a6c368396..011939f19ab0f 100644 --- a/libcxx/include/__cxx03/__ranges/as_rvalue_view.h +++ b/libcxx/include/__cxx03/__ranges/as_rvalue_view.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___RANGES_AS_RVALUE_H #define _LIBCPP___RANGES_AS_RVALUE_H -#include <__concepts/constructible.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/move_iterator.h> -#include <__iterator/move_sentinel.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__iterator/move_sentinel.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include 
<__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 23 diff --git a/libcxx/include/__cxx03/__ranges/chunk_by_view.h b/libcxx/include/__cxx03/__ranges/chunk_by_view.h index 00014d9f10ae8..8f17d56e5e93b 100644 --- a/libcxx/include/__cxx03/__ranges/chunk_by_view.h +++ b/libcxx/include/__cxx03/__ranges/chunk_by_view.h @@ -10,41 +10,41 @@ #ifndef _LIBCPP___RANGES_CHUNK_BY_VIEW_H #define _LIBCPP___RANGES_CHUNK_BY_VIEW_H -#include <__algorithm/ranges_adjacent_find.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/prev.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/movable_box.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/reverse_view.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_adjacent_find.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include 
<__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/reverse_view.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/common_view.h b/libcxx/include/__cxx03/__ranges/common_view.h index 133236dd1d78a..5af8de542dad9 100644 --- a/libcxx/include/__cxx03/__ranges/common_view.h +++ b/libcxx/include/__cxx03/__ranges/common_view.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___RANGES_COMMON_VIEW_H #define _LIBCPP___RANGES_COMMON_VIEW_H -#include <__concepts/constructible.h> -#include <__concepts/copyable.h> -#include <__config> -#include <__iterator/common_iterator.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> 
-#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/common_iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/concepts.h b/libcxx/include/__cxx03/__ranges/concepts.h index 674a3f359ff99..238d02cc2f68f 100644 --- a/libcxx/include/__cxx03/__ranges/concepts.h +++ b/libcxx/include/__cxx03/__ranges/concepts.h @@ -10,26 +10,26 @@ #ifndef _LIBCPP___RANGES_CONCEPTS_H #define _LIBCPP___RANGES_CONCEPTS_H -#include <__concepts/constructible.h> -#include <__concepts/movable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/readable_traits.h> -#include <__ranges/access.h> -#include <__ranges/data.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__ranges/size.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/movable.h> +#include 
<__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/enable_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/container_compatible_range.h b/libcxx/include/__cxx03/__ranges/container_compatible_range.h index a58f1119885e3..7bf8218e667d0 100644 --- a/libcxx/include/__cxx03/__ranges/container_compatible_range.h +++ b/libcxx/include/__cxx03/__ranges/container_compatible_range.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___RANGES_CONTAINER_COMPATIBLE_RANGE_H #define _LIBCPP___RANGES_CONTAINER_COMPATIBLE_RANGE_H -#include <__concepts/convertible_to.h> -#include <__config> -#include <__ranges/concepts.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__ranges/concepts.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/counted.h b/libcxx/include/__cxx03/__ranges/counted.h index e365deca4e632..a20a4824b03cd 100644 --- a/libcxx/include/__cxx03/__ranges/counted.h +++ b/libcxx/include/__cxx03/__ranges/counted.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___RANGES_COUNTED_H #define _LIBCPP___RANGES_COUNTED_H -#include <__concepts/convertible_to.h> -#include <__config> -#include 
<__iterator/concepts.h> -#include <__iterator/counted_iterator.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__ranges/subrange.h> -#include <__type_traits/decay.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/counted_iterator.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/span> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/dangling.h b/libcxx/include/__cxx03/__ranges/dangling.h index 613084d5fb9fc..fee2712ac829b 100644 --- a/libcxx/include/__cxx03/__ranges/dangling.h +++ b/libcxx/include/__cxx03/__ranges/dangling.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___RANGES_DANGLING_H #define _LIBCPP___RANGES_DANGLING_H -#include <__config> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__type_traits/conditional.h> +#include <__cxx03/__config> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/conditional.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/data.h b/libcxx/include/__cxx03/__ranges/data.h index 50db3cffeeed8..2e38f210804da 100644 --- 
a/libcxx/include/__cxx03/__ranges/data.h +++ b/libcxx/include/__cxx03/__ranges/data.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___RANGES_DATA_H #define _LIBCPP___RANGES_DATA_H -#include <__concepts/class_or_enum.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__memory/pointer_traits.h> -#include <__ranges/access.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/remove_reference.h> -#include <__utility/auto_cast.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/auto_cast.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/drop_view.h b/libcxx/include/__cxx03/__ranges/drop_view.h index 853e22a402cad..9d1336fc877ed 100644 --- a/libcxx/include/__cxx03/__ranges/drop_view.h +++ b/libcxx/include/__cxx03/__ranges/drop_view.h @@ -10,46 +10,46 @@ #ifndef _LIBCPP___RANGES_DROP_VIEW_H #define _LIBCPP___RANGES_DROP_VIEW_H -#include <__algorithm/min.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__fwd/span.h> -#include <__fwd/string_view.h> -#include <__iterator/concepts.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> 
-#include <__iterator/next.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty_view.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/iota_view.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/repeat_view.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/auto_cast.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__fwd/span.h> +#include <__cxx03/__fwd/string_view.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty_view.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/iota_view.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/repeat_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include 
<__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/drop_while_view.h b/libcxx/include/__cxx03/__ranges/drop_while_view.h index 92f48bd0ecfba..0542908a6cb5a 100644 --- a/libcxx/include/__cxx03/__ranges/drop_while_view.h +++ b/libcxx/include/__cxx03/__ranges/drop_while_view.h @@ -10,35 +10,35 @@ #ifndef _LIBCPP___RANGES_DROP_WHILE_VIEW_H #define _LIBCPP___RANGES_DROP_WHILE_VIEW_H -#include <__algorithm/ranges_find_if_not.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/movable_box.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_find_if_not.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include 
<__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/elements_view.h b/libcxx/include/__cxx03/__ranges/elements_view.h index f159f53dc0a83..515562719039b 100644 --- a/libcxx/include/__cxx03/__ranges/elements_view.h +++ b/libcxx/include/__cxx03/__ranges/elements_view.h @@ -10,41 +10,41 @@ #ifndef _LIBCPP___RANGES_ELEMENTS_VIEW_H #define _LIBCPP___RANGES_ELEMENTS_VIEW_H -#include <__compare/three_way_comparable.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__config> -#include <__fwd/complex.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_like.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include 
<__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/complex.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_like.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/empty.h b/libcxx/include/__cxx03/__ranges/empty.h index 5c1004042aba5..957fed827b404 100644 --- a/libcxx/include/__cxx03/__ranges/empty.h +++ b/libcxx/include/__cxx03/__ranges/empty.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___RANGES_EMPTY_H #define _LIBCPP___RANGES_EMPTY_H -#include <__concepts/class_or_enum.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__ranges/access.h> -#include <__ranges/size.h> +#include 
<__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/size.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/empty_view.h b/libcxx/include/__cxx03/__ranges/empty_view.h index 6c04b0200c35f..265575baf678d 100644 --- a/libcxx/include/__cxx03/__ranges/empty_view.h +++ b/libcxx/include/__cxx03/__ranges/empty_view.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___RANGES_EMPTY_VIEW_H #define _LIBCPP___RANGES_EMPTY_VIEW_H -#include <__config> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/view_interface.h> -#include <__type_traits/is_object.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/enable_borrowed_range.h b/libcxx/include/__cxx03/__ranges/enable_borrowed_range.h index 1d068335e20af..81051f67c003c 100644 --- a/libcxx/include/__cxx03/__ranges/enable_borrowed_range.h +++ b/libcxx/include/__cxx03/__ranges/enable_borrowed_range.h @@ -14,7 +14,7 @@ // separate header is used to avoid including the entire header in // and . 
-#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/enable_view.h b/libcxx/include/__cxx03/__ranges/enable_view.h index f570926eb67c3..4697dab872bcf 100644 --- a/libcxx/include/__cxx03/__ranges/enable_view.h +++ b/libcxx/include/__cxx03/__ranges/enable_view.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___RANGES_ENABLE_VIEW_H #define _LIBCPP___RANGES_ENABLE_VIEW_H -#include <__concepts/derived_from.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__type_traits/is_class.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/filter_view.h b/libcxx/include/__cxx03/__ranges/filter_view.h index 5b938dd4c16e1..5015140c39ae7 100644 --- a/libcxx/include/__cxx03/__ranges/filter_view.h +++ b/libcxx/include/__cxx03/__ranges/filter_view.h @@ -10,42 +10,42 @@ #ifndef _LIBCPP___RANGES_FILTER_VIEW_H #define _LIBCPP___RANGES_FILTER_VIEW_H -#include <__algorithm/ranges_find_if.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/invoke.h> -#include <__functional/reference_wrapper.h> -#include <__iterator/concepts.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> 
-#include <__ranges/movable_box.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_find_if.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/from_range.h b/libcxx/include/__cxx03/__ranges/from_range.h index 
a6cb9e3d439eb..55ff79edd513d 100644 --- a/libcxx/include/__cxx03/__ranges/from_range.h +++ b/libcxx/include/__cxx03/__ranges/from_range.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___RANGES_FROM_RANGE_H #define _LIBCPP___RANGES_FROM_RANGE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/iota_view.h b/libcxx/include/__cxx03/__ranges/iota_view.h index b2fa958a0f56e..5bc2394dc3dc2 100644 --- a/libcxx/include/__cxx03/__ranges/iota_view.h +++ b/libcxx/include/__cxx03/__ranges/iota_view.h @@ -10,38 +10,38 @@ #ifndef _LIBCPP___RANGES_IOTA_VIEW_H #define _LIBCPP___RANGES_IOTA_VIEW_H -#include <__assert> -#include <__compare/three_way_comparable.h> -#include <__concepts/arithmetic.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/invocable.h> -#include <__concepts/same_as.h> -#include <__concepts/semiregular.h> -#include <__concepts/totally_ordered.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/unreachable_sentinel.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/movable_box.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__assert> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include 
<__cxx03/__concepts/invocable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__concepts/totally_ordered.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/unreachable_sentinel.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/istream_view.h b/libcxx/include/__cxx03/__ranges/istream_view.h index cd7096d35c2c1..8db34132af59c 100644 --- a/libcxx/include/__cxx03/__ranges/istream_view.h +++ b/libcxx/include/__cxx03/__ranges/istream_view.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___RANGES_ISTREAM_VIEW_H #define _LIBCPP___RANGES_ISTREAM_VIEW_H -#include <__concepts/constructible.h> -#include <__concepts/derived_from.h> -#include <__concepts/movable.h> -#include <__config> -#include <__fwd/istream.h> -#include <__fwd/string.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/view_interface.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/istream.h> 
+#include <__cxx03/__fwd/string.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/join_view.h b/libcxx/include/__cxx03/__ranges/join_view.h index 9c2c77995539b..ea47eeefcf951 100644 --- a/libcxx/include/__cxx03/__ranges/join_view.h +++ b/libcxx/include/__cxx03/__ranges/join_view.h @@ -10,39 +10,39 @@ #ifndef _LIBCPP___RANGES_JOIN_VIEW_H #define _LIBCPP___RANGES_JOIN_VIEW_H -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/iterator_with_data.h> -#include <__iterator/segmented_iterator.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/view_interface.h> -#include <__type_traits/common_type.h> -#include <__type_traits/maybe_const.h> -#include <__utility/as_lvalue.h> -#include <__utility/empty.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include 
<__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/iterator_with_data.h> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__utility/as_lvalue.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/lazy_split_view.h b/libcxx/include/__cxx03/__ranges/lazy_split_view.h index db031fe5f8b49..9e70c237b4fb2 100644 --- a/libcxx/include/__cxx03/__ranges/lazy_split_view.h +++ b/libcxx/include/__cxx03/__ranges/lazy_split_view.h @@ -10,45 +10,45 @@ #ifndef _LIBCPP___RANGES_LAZY_SPLIT_VIEW_H #define _LIBCPP___RANGES_LAZY_SPLIT_VIEW_H -#include <__algorithm/ranges_find.h> -#include <__algorithm/ranges_mismatch.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/derived_from.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/ranges_operations.h> -#include <__iterator/concepts.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include 
<__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/single_view.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/remove_reference.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_find.h> +#include <__cxx03/__algorithm/ranges_mismatch.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/single_view.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/forward.h> +#include 
<__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/movable_box.h b/libcxx/include/__cxx03/__ranges/movable_box.h index 5a456cc3a1b66..ab6f407ed537b 100644 --- a/libcxx/include/__cxx03/__ranges/movable_box.h +++ b/libcxx/include/__cxx03/__ranges/movable_box.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___RANGES_MOVABLE_BOX_H #define _LIBCPP___RANGES_MOVABLE_BOX_H -#include <__concepts/constructible.h> -#include <__concepts/copyable.h> -#include <__concepts/movable.h> -#include <__config> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/move.h> -#include +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/non_propagating_cache.h b/libcxx/include/__cxx03/__ranges/non_propagating_cache.h index b2de2d2ae1cb5..6f8c8600bab10 100644 --- a/libcxx/include/__cxx03/__ranges/non_propagating_cache.h +++ b/libcxx/include/__cxx03/__ranges/non_propagating_cache.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___RANGES_NON_PROPAGATING_CACHE_H #define _LIBCPP___RANGES_NON_PROPAGATING_CACHE_H -#include <__config> -#include <__iterator/concepts.h> // indirectly_readable -#include <__iterator/iterator_traits.h> // iter_reference_t -#include 
<__memory/addressof.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> // indirectly_readable +#include <__cxx03/__iterator/iterator_traits.h> // iter_reference_t +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/owning_view.h b/libcxx/include/__cxx03/__ranges/owning_view.h index 254bdb4329119..ab7e4e3eeb974 100644 --- a/libcxx/include/__cxx03/__ranges/owning_view.h +++ b/libcxx/include/__cxx03/__ranges/owning_view.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___RANGES_OWNING_VIEW_H #define _LIBCPP___RANGES_OWNING_VIEW_H -#include <__concepts/constructible.h> -#include <__concepts/movable.h> -#include <__config> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/empty.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/movable.h> +#include <__cxx03/__config> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/empty.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/range_adaptor.h b/libcxx/include/__cxx03/__ranges/range_adaptor.h index 2da246f24e1d2..ef7301d695848 100644 --- 
a/libcxx/include/__cxx03/__ranges/range_adaptor.h +++ b/libcxx/include/__cxx03/__ranges/range_adaptor.h @@ -10,27 +10,27 @@ #ifndef _LIBCPP___RANGES_RANGE_ADAPTOR_H #define _LIBCPP___RANGES_RANGE_ADAPTOR_H -#include <__concepts/constructible.h> -#include <__concepts/derived_from.h> -#include <__concepts/invocable.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/compose.h> -#include <__functional/invoke.h> -#include <__ranges/concepts.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_class.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/compose.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/rbegin.h b/libcxx/include/__cxx03/__ranges/rbegin.h index 12e739e1a2b85..3e114605baf37 100644 --- a/libcxx/include/__cxx03/__ranges/rbegin.h +++ b/libcxx/include/__cxx03/__ranges/rbegin.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___RANGES_RBEGIN_H #define _LIBCPP___RANGES_RBEGIN_H -#include <__concepts/class_or_enum.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/readable_traits.h> -#include 
<__iterator/reverse_iterator.h> -#include <__ranges/access.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/auto_cast.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/auto_cast.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/ref_view.h b/libcxx/include/__cxx03/__ranges/ref_view.h index 5329d778dd30d..9fd2835cf385c 100644 --- a/libcxx/include/__cxx03/__ranges/ref_view.h +++ b/libcxx/include/__cxx03/__ranges/ref_view.h @@ -10,23 +10,23 @@ #ifndef _LIBCPP___RANGES_REF_VIEW_H #define _LIBCPP___RANGES_REF_VIEW_H -#include <__concepts/convertible_to.h> -#include <__concepts/different_from.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/empty.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__type_traits/is_object.h> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/different_from.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include 
<__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/empty.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/rend.h b/libcxx/include/__cxx03/__ranges/rend.h index 02b4c5999a7eb..9d663e54f4619 100644 --- a/libcxx/include/__cxx03/__ranges/rend.h +++ b/libcxx/include/__cxx03/__ranges/rend.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___RANGES_REND_H #define _LIBCPP___RANGES_REND_H -#include <__concepts/class_or_enum.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/readable_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__ranges/access.h> -#include <__ranges/rbegin.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/auto_cast.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/readable_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/rbegin.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/auto_cast.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/repeat_view.h b/libcxx/include/__cxx03/__ranges/repeat_view.h index 53e4beb270ad0..fae883e01bcfe 100644 --- a/libcxx/include/__cxx03/__ranges/repeat_view.h +++ b/libcxx/include/__cxx03/__ranges/repeat_view.h @@ -10,34 +10,34 @@ #ifndef _LIBCPP___RANGES_REPEAT_VIEW_H #define _LIBCPP___RANGES_REPEAT_VIEW_H -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/same_as.h> -#include <__concepts/semiregular.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/unreachable_sentinel.h> -#include <__memory/addressof.h> -#include <__ranges/iota_view.h> -#include <__ranges/movable_box.h> -#include <__ranges/view_interface.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_object.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_cv.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> -#include <__utility/piecewise_construct.h> -#include <tuple> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__concepts/semiregular.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/unreachable_sentinel.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/iota_view.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/piecewise_construct.h>
+#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/reverse_view.h b/libcxx/include/__cxx03/__ranges/reverse_view.h index 796f5be22328b..7513114779c5a 100644 --- a/libcxx/include/__cxx03/__ranges/reverse_view.h +++ b/libcxx/include/__cxx03/__ranges/reverse_view.h @@ -10,31 +10,31 @@ #ifndef _LIBCPP___RANGES_REVERSE_VIEW_H #define _LIBCPP___RANGES_REVERSE_VIEW_H -#include <__concepts/constructible.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/next.h> -#include <__iterator/reverse_iterator.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/single_view.h b/libcxx/include/__cxx03/__ranges/single_view.h index 45244f34994d7..34054fa0f9d42 100644 --- a/libcxx/include/__cxx03/__ranges/single_view.h +++ b/libcxx/include/__cxx03/__ranges/single_view.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___RANGES_SINGLE_VIEW_H #define _LIBCPP___RANGES_SINGLE_VIEW_H -#include <__concepts/constructible.h> -#include <__config> -#include <__ranges/movable_box.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/view_interface.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_object.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> -#include <cstddef> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/size.h b/libcxx/include/__cxx03/__ranges/size.h index 40b0c6b6aad7a..84b165551d921 100644 --- a/libcxx/include/__cxx03/__ranges/size.h +++ b/libcxx/include/__cxx03/__ranges/size.h @@ -10,19 +10,19 @@ #ifndef _LIBCPP___RANGES_SIZE_H #define _LIBCPP___RANGES_SIZE_H -#include <__concepts/arithmetic.h> -#include <__concepts/class_or_enum.h> -#include <__config> -#include <__iterator/concepts.h> -#include
<__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__type_traits/decay.h> -#include <__type_traits/make_signed.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/auto_cast.h> -#include <__utility/declval.h> -#include <cstddef> +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__concepts/class_or_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/make_signed.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/split_view.h b/libcxx/include/__cxx03/__ranges/split_view.h index ce3606aedfefb..f23e7878e349c 100644 --- a/libcxx/include/__cxx03/__ranges/split_view.h +++ b/libcxx/include/__cxx03/__ranges/split_view.h @@ -10,34 +10,34 @@ #ifndef _LIBCPP___RANGES_SPLIT_VIEW_H #define _LIBCPP___RANGES_SPLIT_VIEW_H -#include <__algorithm/ranges_search.h> -#include <__concepts/constructible.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/ranges_operations.h> -#include <__iterator/indirectly_comparable.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty.h> -#include <__ranges/non_propagating_cache.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/single_view.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/forward.h>
-#include <__utility/move.h> +#include <__cxx03/__algorithm/ranges_search.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/ranges_operations.h> +#include <__cxx03/__iterator/indirectly_comparable.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty.h> +#include <__cxx03/__ranges/non_propagating_cache.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/single_view.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/subrange.h b/libcxx/include/__cxx03/__ranges/subrange.h index aba584ef93354..fa985a5c365af 100644 --- a/libcxx/include/__cxx03/__ranges/subrange.h +++ b/libcxx/include/__cxx03/__ranges/subrange.h @@ -10,43 +10,43 @@ #ifndef _LIBCPP___RANGES_SUBRANGE_H #define _LIBCPP___RANGES_SUBRANGE_H -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/different_from.h> -#include <__config> -#include <__fwd/subrange.h> -#include <__iterator/advance.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/dangling.h> -#include 
<__ranges/enable_borrowed_range.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_like_no_subrange.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_pointer.h> -#include <__utility/move.h> -#include <cstddef> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/different_from.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/subrange.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/dangling.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_like_no_subrange.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include
<__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/take_view.h b/libcxx/include/__cxx03/__ranges/take_view.h index 27ca8155a69b1..e909158f5c417 100644 --- a/libcxx/include/__cxx03/__ranges/take_view.h +++ b/libcxx/include/__cxx03/__ranges/take_view.h @@ -10,46 +10,46 @@ #ifndef _LIBCPP___RANGES_TAKE_VIEW_H #define _LIBCPP___RANGES_TAKE_VIEW_H -#include <__algorithm/min.h> -#include <__algorithm/ranges_min.h> -#include <__assert> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__fwd/span.h> -#include <__fwd/string_view.h> -#include <__iterator/concepts.h> -#include <__iterator/counted_iterator.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty_view.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/iota_view.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/repeat_view.h> -#include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__ranges/view_interface.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/auto_cast.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <cstddef> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/ranges_min.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__fwd/span.h> +#include <__cxx03/__fwd/string_view.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/counted_iterator.h> +#include
<__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty_view.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/iota_view.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/repeat_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/subrange.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/take_while_view.h b/libcxx/include/__cxx03/__ranges/take_while_view.h index 77ea9f7bb8131..3968777240ed2 100644 --- a/libcxx/include/__cxx03/__ranges/take_while_view.h +++ b/libcxx/include/__cxx03/__ranges/take_while_view.h @@ -10,33 +10,33 @@ #ifndef _LIBCPP___RANGES_TAKE_WHILE_VIEW_H #define _LIBCPP___RANGES_TAKE_WHILE_VIEW_H -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/invoke.h> -#include <__iterator/concepts.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/movable_box.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/view_interface.h> -#include <__type_traits/decay.h> -#include 
<__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/maybe_const.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/to.h b/libcxx/include/__cxx03/__ranges/to.h index e0abe6290b8f7..682b811946a62 100644 --- a/libcxx/include/__cxx03/__ranges/to.h +++ b/libcxx/include/__cxx03/__ranges/to.h @@ -10,30 +10,30 @@ #ifndef _LIBCPP___RANGES_TO_H #define _LIBCPP___RANGES_TO_H -#include <__algorithm/ranges_copy.h> -#include <__concepts/constructible.h> -#include <__concepts/convertible_to.h> -#include <__concepts/derived_from.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/insert_iterator.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> 
-#include <__ranges/from_range.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/ref_view.h> -#include <__ranges/size.h> -#include <__ranges/transform_view.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_volatile.h> -#include <__type_traits/type_identity.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <cstddef> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/insert_iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/ref_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/transform_view.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/transform_view.h b/libcxx/include/__cxx03/__ranges/transform_view.h index bcce389c0e680..78012cde38854 100644 --- a/libcxx/include/__cxx03/__ranges/transform_view.h +++ b/libcxx/include/__cxx03/__ranges/transform_view.h @@ -10,45 +10,45 @@ #ifndef _LIBCPP___RANGES_TRANSFORM_VIEW_H #define _LIBCPP___RANGES_TRANSFORM_VIEW_H -#include <__compare/three_way_comparable.h> -#include <__concepts/constructible.h> -#include
<__concepts/convertible_to.h> -#include <__concepts/copyable.h> -#include <__concepts/derived_from.h> -#include <__concepts/equality_comparable.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__functional/bind_back.h> -#include <__functional/invoke.h> -#include <__functional/perfect_forward.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__memory/addressof.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty.h> -#include <__ranges/movable_box.h> -#include <__ranges/range_adaptor.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/copyable.h> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/bind_back.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/perfect_forward.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty.h> +#include <__cxx03/__ranges/movable_box.h> +#include <__cxx03/__ranges/range_adaptor.h> +#include <__cxx03/__ranges/size.h> +#include 
<__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__ranges/view_interface.h b/libcxx/include/__cxx03/__ranges/view_interface.h index 3bcfbaf3a2f9e..23cdcdaaad43c 100644 --- a/libcxx/include/__cxx03/__ranges/view_interface.h +++ b/libcxx/include/__cxx03/__ranges/view_interface.h @@ -10,21 +10,21 @@ #ifndef _LIBCPP___RANGES_VIEW_INTERFACE_H #define _LIBCPP___RANGES_VIEW_INTERFACE_H -#include <__assert> -#include <__concepts/derived_from.h> -#include <__concepts/same_as.h> -#include <__config> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/prev.h> -#include <__memory/pointer_traits.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/empty.h> -#include <__ranges/size.h> -#include <__type_traits/is_class.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__assert> +#include <__cxx03/__concepts/derived_from.h> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty.h> +#include 
<__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/views.h b/libcxx/include/__cxx03/__ranges/views.h index a4de2a5e52a9d..2809271f6e1a9 100644 --- a/libcxx/include/__cxx03/__ranges/views.h +++ b/libcxx/include/__cxx03/__ranges/views.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___RANGES_VIEWS #define _LIBCPP___RANGES_VIEWS -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__ranges/zip_view.h b/libcxx/include/__cxx03/__ranges/zip_view.h index fe3c87a9306fe..4d9f62647c8f2 100644 --- a/libcxx/include/__cxx03/__ranges/zip_view.h +++ b/libcxx/include/__cxx03/__ranges/zip_view.h @@ -10,41 +10,41 @@ #ifndef _LIBCPP___RANGES_ZIP_VIEW_H #define _LIBCPP___RANGES_ZIP_VIEW_H -#include <__config> - -#include <__algorithm/ranges_min.h> -#include <__compare/three_way_comparable.h> -#include <__concepts/convertible_to.h> -#include <__concepts/equality_comparable.h> -#include <__functional/invoke.h> -#include <__functional/operations.h> -#include <__iterator/concepts.h> -#include <__iterator/incrementable_traits.h> -#include <__iterator/iter_move.h> -#include <__iterator/iter_swap.h> -#include <__iterator/iterator_traits.h> -#include <__ranges/access.h> -#include <__ranges/all.h> -#include <__ranges/concepts.h> -#include <__ranges/empty_view.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/size.h> -#include <__ranges/view_interface.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/integer_sequence.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <tuple> +#include <__cxx03/__config> +
+#include <__cxx03/__algorithm/ranges_min.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/incrementable_traits.h> +#include <__cxx03/__iterator/iter_move.h> +#include <__cxx03/__iterator/iter_swap.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/all.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/empty_view.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__ranges/view_interface.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/tuple> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__split_buffer b/libcxx/include/__cxx03/__split_buffer index bab724d1b8963..133048bfea77b 100644 --- a/libcxx/include/__cxx03/__split_buffer +++ b/libcxx/include/__cxx03/__split_buffer @@ -10,39 +10,39 @@ #ifndef _LIBCPP___SPLIT_BUFFER #define _LIBCPP___SPLIT_BUFFER -#include <__algorithm/max.h> -#include <__algorithm/move.h> -#include <__algorithm/move_backward.h> -#include <__config> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_iterator.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocator.h> -#include <__memory/allocator_traits.h> 
-#include <__memory/compressed_pair.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/remove_reference.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/move_backward.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__memory/allocate_at_least.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include 
<__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__std_clang_module b/libcxx/include/__cxx03/__std_clang_module index 18d6ce6b46c1f..dc542e6fae0c8 100644 --- a/libcxx/include/__cxx03/__std_clang_module +++ b/libcxx/include/__cxx03/__std_clang_module @@ -21,195 +21,195 @@ # error "Do not include this header directly, include individual headers instead" #endif -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif -#include -#include -#include +#include <__cxx03/algorithm> +#include <__cxx03/any> +#include <__cxx03/array> #if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include +# include <__cxx03/atomic> #endif #if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include <__cxx03/barrier> +#endif +#include <__cxx03/bit> +#include <__cxx03/bitset> +#include <__cxx03/cassert> +#include <__cxx03/ccomplex> +#include <__cxx03/cctype> +#include <__cxx03/cerrno> +#include <__cxx03/cfenv> +#include <__cxx03/cfloat> +#include <__cxx03/charconv> +#include <__cxx03/chrono> +#include <__cxx03/cinttypes> +#include <__cxx03/ciso646> +#include <__cxx03/climits> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/clocale> #endif -#include +#include <__cxx03/cmath> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include <__cxx03/codecvt> +#endif +#include <__cxx03/compare> +#include <__cxx03/complex.h> +#include <__cxx03/complex> +#include <__cxx03/concepts> +#include <__cxx03/condition_variable> +#include <__cxx03/coroutine> +#include <__cxx03/csetjmp> +#include <__cxx03/csignal> 
+#include <__cxx03/cstdarg> +#include <__cxx03/cstdbool> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/cstdio> +#include <__cxx03/cstdlib> +#include <__cxx03/cstring> +#include <__cxx03/ctgmath> +#include <__cxx03/ctime> +#include <__cxx03/ctype.h> +#include <__cxx03/cuchar> #if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include +# include <__cxx03/cwchar> #endif #if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include <__cxx03/cwctype> +#endif +#include <__cxx03/deque> +#include <__cxx03/errno.h> +#include <__cxx03/exception> +#include <__cxx03/execution> +#include <__cxx03/expected> +#include <__cxx03/experimental/iterator> +#include <__cxx03/experimental/memory> +#include <__cxx03/experimental/propagate_const> +#include <__cxx03/experimental/simd> +#include <__cxx03/experimental/type_traits> +#include <__cxx03/experimental/utility> +#include <__cxx03/fenv.h> +#include <__cxx03/filesystem> +#include <__cxx03/float.h> +#include <__cxx03/format> +#include <__cxx03/forward_list> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/fstream> #endif -#include +#include <__cxx03/functional> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include +# include <__cxx03/future> #endif -#include -#include +#include <__cxx03/initializer_list> +#include <__cxx03/inttypes.h> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/iomanip> #endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/ios> #endif -#include +#include <__cxx03/iosfwd> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/iostream> #endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/istream> #endif -#include +#include <__cxx03/iterator> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include +# include 
<__cxx03/latch> #endif -#include -#include +#include <__cxx03/limits> +#include <__cxx03/list> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/locale.h> #endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include <__cxx03/locale> +#endif +#include <__cxx03/map> +#include <__cxx03/math.h> +#include <__cxx03/mdspan> +#include <__cxx03/memory> +#include <__cxx03/memory_resource> +#include <__cxx03/mutex> +#include <__cxx03/new> +#include <__cxx03/numbers> +#include <__cxx03/numeric> +#include <__cxx03/optional> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/ostream> #endif -#include -#include -#include -#include -#include +#include <__cxx03/print> +#include <__cxx03/queue> +#include <__cxx03/random> +#include <__cxx03/ranges> +#include <__cxx03/ratio> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/regex> #endif -#include +#include <__cxx03/scoped_allocator> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include +# include <__cxx03/semaphore> #endif -#include +#include <__cxx03/set> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include +# include <__cxx03/shared_mutex> #endif -#include -#include +#include <__cxx03/source_location> +#include <__cxx03/span> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/sstream> #endif -#include +#include <__cxx03/stack> #if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include -#endif -#include -#include -#include -#include -#include -#include +# include <__cxx03/stdatomic.h> +#endif +#include <__cxx03/stdbool.h> +#include <__cxx03/stddef.h> +#include <__cxx03/stdexcept> +#include <__cxx03/stdint.h> +#include <__cxx03/stdio.h> +#include <__cxx03/stdlib.h> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include +# include <__cxx03/stop_token> #endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/streambuf> #endif 
-#include -#include -#include +#include <__cxx03/string.h> +#include <__cxx03/string> +#include <__cxx03/string_view> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/strstream> #endif #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/syncstream> #endif -#include -#include +#include <__cxx03/system_error> +#include <__cxx03/tgmath.h> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +# include <__cxx03/thread> +#endif +#include <__cxx03/tuple> +#include <__cxx03/type_traits> +#include <__cxx03/typeindex> +#include <__cxx03/typeinfo> +#include <__cxx03/uchar.h> +#include <__cxx03/unordered_map> +#include <__cxx03/unordered_set> +#include <__cxx03/utility> +#include <__cxx03/valarray> +#include <__cxx03/variant> +#include <__cxx03/vector> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include +# include <__cxx03/wchar.h> #endif #if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include +# include <__cxx03/wctype.h> #endif diff --git a/libcxx/include/__cxx03/__std_mbstate_t.h b/libcxx/include/__cxx03/__std_mbstate_t.h index e79cc789fddf9..ac28555214188 100644 --- a/libcxx/include/__cxx03/__std_mbstate_t.h +++ b/libcxx/include/__cxx03/__std_mbstate_t.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___STD_MBSTATE_T_H #define _LIBCPP___STD_MBSTATE_T_H -#include <__config> -#include <__mbstate_t.h> +#include <__cxx03/__config> +#include <__cxx03/__mbstate_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__stop_token/atomic_unique_lock.h b/libcxx/include/__cxx03/__stop_token/atomic_unique_lock.h index 13e59f9f0dce0..a8e4d2a90a53b 100644 --- a/libcxx/include/__cxx03/__stop_token/atomic_unique_lock.h +++ b/libcxx/include/__cxx03/__stop_token/atomic_unique_lock.h @@ -10,9 +10,9 @@ #ifndef 
_LIBCPP___STOP_TOKEN_ATOMIC_UNIQUE_GUARD_H #define _LIBCPP___STOP_TOKEN_ATOMIC_UNIQUE_GUARD_H -#include <__bit/popcount.h> -#include <__config> -#include +#include <__cxx03/__bit/popcount.h> +#include <__cxx03/__config> +#include <__cxx03/atomic> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__stop_token/intrusive_list_view.h b/libcxx/include/__cxx03/__stop_token/intrusive_list_view.h index 11a3e267e7c6d..ab11762802ac4 100644 --- a/libcxx/include/__cxx03/__stop_token/intrusive_list_view.h +++ b/libcxx/include/__cxx03/__stop_token/intrusive_list_view.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___STOP_TOKEN_INTRUSIVE_LIST_VIEW_H #define _LIBCPP___STOP_TOKEN_INTRUSIVE_LIST_VIEW_H -#include <__assert> -#include <__config> +#include <__cxx03/__assert> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__stop_token/intrusive_shared_ptr.h b/libcxx/include/__cxx03/__stop_token/intrusive_shared_ptr.h index f00cea5bc2b67..62eb946a5bbed 100644 --- a/libcxx/include/__cxx03/__stop_token/intrusive_shared_ptr.h +++ b/libcxx/include/__cxx03/__stop_token/intrusive_shared_ptr.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___STOP_TOKEN_INTRUSIVE_SHARED_PTR_H #define _LIBCPP___STOP_TOKEN_INTRUSIVE_SHARED_PTR_H -#include <__atomic/atomic.h> -#include <__atomic/memory_order.h> -#include <__config> -#include <__type_traits/is_reference.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include +#include <__cxx03/__atomic/atomic.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__stop_token/stop_callback.h b/libcxx/include/__cxx03/__stop_token/stop_callback.h index 760cf2bb55b0c..afd194cca1d00 100644 --- a/libcxx/include/__cxx03/__stop_token/stop_callback.h +++ b/libcxx/include/__cxx03/__stop_token/stop_callback.h @@ -10,24 +10,24 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H #define _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H -#include <__concepts/constructible.h> -#include <__concepts/destructible.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__stop_token/intrusive_shared_ptr.h> -#include <__stop_token/stop_state.h> -#include <__stop_token/stop_token.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/private_constructor_tag.h> +#include <__cxx03/__concepts/constructible.h> +#include <__cxx03/__concepts/destructible.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__config> +#include <__cxx03/__stop_token/intrusive_shared_ptr.h> +#include <__cxx03/__stop_token/stop_state.h> +#include <__cxx03/__stop_token/stop_token.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/private_constructor_tag.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__stop_token/stop_source.h b/libcxx/include/__cxx03/__stop_token/stop_source.h index 70697462784ab..0d8b714bfba4e 100644 --- a/libcxx/include/__cxx03/__stop_token/stop_source.h +++ b/libcxx/include/__cxx03/__stop_token/stop_source.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_SOURCE_H #define _LIBCPP___STOP_TOKEN_STOP_SOURCE_H -#include <__config> -#include <__stop_token/intrusive_shared_ptr.h> -#include 
<__stop_token/stop_state.h> -#include <__stop_token/stop_token.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__stop_token/intrusive_shared_ptr.h> +#include <__cxx03/__stop_token/stop_state.h> +#include <__cxx03/__stop_token/stop_token.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__stop_token/stop_state.h b/libcxx/include/__cxx03/__stop_token/stop_state.h index b0eed13a143cf..39688917d1d47 100644 --- a/libcxx/include/__cxx03/__stop_token/stop_state.h +++ b/libcxx/include/__cxx03/__stop_token/stop_state.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_STATE_H #define _LIBCPP___STOP_TOKEN_STOP_STATE_H -#include <__assert> -#include <__config> -#include <__stop_token/atomic_unique_lock.h> -#include <__stop_token/intrusive_list_view.h> -#include <__thread/id.h> -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__stop_token/atomic_unique_lock.h> +#include <__cxx03/__stop_token/intrusive_list_view.h> +#include <__cxx03/__thread/id.h> +#include <__cxx03/atomic> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__stop_token/stop_token.h b/libcxx/include/__cxx03/__stop_token/stop_token.h index 1bd75cbbf6f8d..bba5cd1c4796e 100644 --- a/libcxx/include/__cxx03/__stop_token/stop_token.h +++ b/libcxx/include/__cxx03/__stop_token/stop_token.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___STOP_TOKEN_STOP_TOKEN_H #define _LIBCPP___STOP_TOKEN_STOP_TOKEN_H -#include <__config> -#include <__stop_token/intrusive_shared_ptr.h> -#include <__stop_token/stop_state.h> +#include <__cxx03/__config> +#include <__cxx03/__stop_token/intrusive_shared_ptr.h> +#include <__cxx03/__stop_token/stop_state.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__string/char_traits.h b/libcxx/include/__cxx03/__string/char_traits.h index 2660ac2ede2d5..69078d6615ea4 100644 --- a/libcxx/include/__cxx03/__string/char_traits.h +++ b/libcxx/include/__cxx03/__string/char_traits.h @@ -9,27 +9,27 @@ #ifndef _LIBCPP___STRING_CHAR_TRAITS_H #define _LIBCPP___STRING_CHAR_TRAITS_H -#include <__algorithm/fill_n.h> -#include <__algorithm/find.h> -#include <__algorithm/find_end.h> -#include <__algorithm/find_first_of.h> -#include <__algorithm/min.h> -#include <__assert> -#include <__compare/ordering.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/identity.h> -#include <__iterator/iterator_traits.h> -#include <__string/constexpr_c_functions.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__utility/is_pointer_in_range.h> -#include -#include -#include -#include +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/find_end.h> +#include <__cxx03/__algorithm/find_first_of.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/identity.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__string/constexpr_c_functions.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__utility/is_pointer_in_range.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/cstdio> +#include <__cxx03/iosfwd> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include // for wmemcpy +# include <__cxx03/cwchar> // for wmemcpy #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -37,7 +37,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__string/constexpr_c_functions.h 
b/libcxx/include/__cxx03/__string/constexpr_c_functions.h index a978f816f1897..b1e269d974087 100644 --- a/libcxx/include/__cxx03/__string/constexpr_c_functions.h +++ b/libcxx/include/__cxx03/__string/constexpr_c_functions.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H #define _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H -#include <__config> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__type_traits/datasizeof.h> -#include <__type_traits/is_always_bitcastable.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_equality_comparable.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__type_traits/is_trivially_lexicographically_comparable.h> -#include <__type_traits/remove_cv.h> -#include <__utility/is_pointer_in_range.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__type_traits/datasizeof.h> +#include <__cxx03/__type_traits/is_always_bitcastable.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_equality_comparable.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__type_traits/is_trivially_lexicographically_comparable.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__utility/is_pointer_in_range.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__string/extern_template_lists.h b/libcxx/include/__cxx03/__string/extern_template_lists.h index cc536e514d4ff..09b3375aeecf6 100644 --- 
a/libcxx/include/__cxx03/__string/extern_template_lists.h +++ b/libcxx/include/__cxx03/__string/extern_template_lists.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___STRING_EXTERN_TEMPLATE_LISTS_H #define _LIBCPP___STRING_EXTERN_TEMPLATE_LISTS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__support/ibm/gettod_zos.h b/libcxx/include/__cxx03/__support/ibm/gettod_zos.h index bd7e467736697..18797cc18e614 100644 --- a/libcxx/include/__cxx03/__support/ibm/gettod_zos.h +++ b/libcxx/include/__cxx03/__support/ibm/gettod_zos.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___SUPPORT_IBM_GETTOD_ZOS_H #define _LIBCPP___SUPPORT_IBM_GETTOD_ZOS_H -#include +#include <__cxx03/time.h> inline _LIBCPP_HIDE_FROM_ABI int gettimeofdayMonotonic(struct timespec64* Output) { // The POSIX gettimeofday() function is not available on z/OS. Therefore, diff --git a/libcxx/include/__cxx03/__support/ibm/locale_mgmt_zos.h b/libcxx/include/__cxx03/__support/ibm/locale_mgmt_zos.h index 5fc04b6b4b299..ec88704ba8604 100644 --- a/libcxx/include/__cxx03/__support/ibm/locale_mgmt_zos.h +++ b/libcxx/include/__cxx03/__support/ibm/locale_mgmt_zos.h @@ -11,8 +11,8 @@ #define _LIBCPP___SUPPORT_IBM_LOCALE_MGMT_ZOS_H #if defined(__MVS__) -# include -# include +# include <__cxx03/locale.h> +# include <__cxx03/string> # ifdef __cplusplus extern "C" { diff --git a/libcxx/include/__cxx03/__support/ibm/nanosleep.h b/libcxx/include/__cxx03/__support/ibm/nanosleep.h index fadc784c0297c..ecf7187543f36 100644 --- a/libcxx/include/__cxx03/__support/ibm/nanosleep.h +++ b/libcxx/include/__cxx03/__support/ibm/nanosleep.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___SUPPORT_IBM_NANOSLEEP_H #define _LIBCPP___SUPPORT_IBM_NANOSLEEP_H -#include +#include <__cxx03/unistd.h> inline int nanosleep(const struct timespec* __req, struct timespec* __rem) { // The nanosleep() function is not available on z/OS. 
Therefore, we will call diff --git a/libcxx/include/__cxx03/__support/xlocale/__nop_locale_mgmt.h b/libcxx/include/__cxx03/__support/xlocale/__nop_locale_mgmt.h index b9ffcbe1622d5..036bd3b183506 100644 --- a/libcxx/include/__cxx03/__support/xlocale/__nop_locale_mgmt.h +++ b/libcxx/include/__cxx03/__support/xlocale/__nop_locale_mgmt.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___SUPPORT_XLOCALE_NOP_LOCALE_MGMT_H #define _LIBCPP___SUPPORT_XLOCALE_NOP_LOCALE_MGMT_H -#include <__config> +#include <__cxx03/__config> // Patch over lack of extended locale support typedef void* locale_t; diff --git a/libcxx/include/__cxx03/__support/xlocale/__posix_l_fallback.h b/libcxx/include/__cxx03/__support/xlocale/__posix_l_fallback.h index 8a3a6f27f48dd..8651c2b7ca958 100644 --- a/libcxx/include/__cxx03/__support/xlocale/__posix_l_fallback.h +++ b/libcxx/include/__cxx03/__support/xlocale/__posix_l_fallback.h @@ -15,14 +15,14 @@ #ifndef _LIBCPP___SUPPORT_XLOCALE_POSIX_L_FALLBACK_H #define _LIBCPP___SUPPORT_XLOCALE_POSIX_L_FALLBACK_H -#include <__config> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/ctype.h> +#include <__cxx03/string.h> +#include <__cxx03/time.h> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include -# include +# include <__cxx03/wchar.h> +# include <__cxx03/wctype.h> #endif inline _LIBCPP_HIDE_FROM_ABI int isalnum_l(int __c, locale_t) { return ::isalnum(__c); } diff --git a/libcxx/include/__cxx03/__support/xlocale/__strtonum_fallback.h b/libcxx/include/__cxx03/__support/xlocale/__strtonum_fallback.h index b7eef5210ed37..3587e2f1a3219 100644 --- a/libcxx/include/__cxx03/__support/xlocale/__strtonum_fallback.h +++ b/libcxx/include/__cxx03/__support/xlocale/__strtonum_fallback.h @@ -15,11 +15,11 @@ #ifndef _LIBCPP___SUPPORT_XLOCALE_STRTONUM_FALLBACK_H #define _LIBCPP___SUPPORT_XLOCALE_STRTONUM_FALLBACK_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/stdlib.h> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include +# 
include <__cxx03/wchar.h> #endif inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { diff --git a/libcxx/include/__cxx03/__system_error/errc.h b/libcxx/include/__cxx03/__system_error/errc.h index 0004c46e2279b..33e670ecd3a70 100644 --- a/libcxx/include/__cxx03/__system_error/errc.h +++ b/libcxx/include/__cxx03/__system_error/errc.h @@ -100,8 +100,8 @@ enum class errc */ -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cerrno> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__system_error/error_category.h b/libcxx/include/__cxx03/__system_error/error_category.h index bfe7bc24a5d3d..cc08cf2801d43 100644 --- a/libcxx/include/__cxx03/__system_error/error_category.h +++ b/libcxx/include/__cxx03/__system_error/error_category.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___SYSTEM_ERROR_ERROR_CATEGORY_H #define _LIBCPP___SYSTEM_ERROR_ERROR_CATEGORY_H -#include <__compare/ordering.h> -#include <__config> -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__system_error/error_code.h b/libcxx/include/__cxx03/__system_error/error_code.h index 475f2bb96a56d..2fe4d299885fc 100644 --- a/libcxx/include/__cxx03/__system_error/error_code.h +++ b/libcxx/include/__cxx03/__system_error/error_code.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP___SYSTEM_ERROR_ERROR_CODE_H #define _LIBCPP___SYSTEM_ERROR_ERROR_CODE_H -#include <__compare/ordering.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__system_error/errc.h> -#include <__system_error/error_category.h> -#include <__system_error/error_condition.h> -#include -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include 
<__cxx03/__functional/unary_function.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__system_error/error_category.h> +#include <__cxx03/__system_error/error_condition.h> +#include <__cxx03/cstddef> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__system_error/error_condition.h b/libcxx/include/__cxx03/__system_error/error_condition.h index 42898c1f0e901..87bfcd6c6350c 100644 --- a/libcxx/include/__cxx03/__system_error/error_condition.h +++ b/libcxx/include/__cxx03/__system_error/error_condition.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___SYSTEM_ERROR_ERROR_CONDITION_H #define _LIBCPP___SYSTEM_ERROR_ERROR_CONDITION_H -#include <__compare/ordering.h> -#include <__config> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__system_error/errc.h> -#include <__system_error/error_category.h> -#include -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__system_error/error_category.h> +#include <__cxx03/cstddef> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__system_error/system_error.h b/libcxx/include/__cxx03/__system_error/system_error.h index 362e67505658c..2487d3bafa06d 100644 --- a/libcxx/include/__cxx03/__system_error/system_error.h +++ b/libcxx/include/__cxx03/__system_error/system_error.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___SYSTEM_ERROR_SYSTEM_ERROR_H #define _LIBCPP___SYSTEM_ERROR_SYSTEM_ERROR_H -#include <__config> -#include <__system_error/error_category.h> -#include <__system_error/error_code.h> -#include <__verbose_abort> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__system_error/error_category.h> +#include 
<__cxx03/__system_error/error_code.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/stdexcept> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/formatter.h b/libcxx/include/__cxx03/__thread/formatter.h index 9b54036dcab36..d7ecba57203ed 100644 --- a/libcxx/include/__cxx03/__thread/formatter.h +++ b/libcxx/include/__cxx03/__thread/formatter.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___THREAD_FORMATTER_H #define _LIBCPP___THREAD_FORMATTER_H -#include <__concepts/arithmetic.h> -#include <__config> -#include <__format/concepts.h> -#include <__format/format_parse_context.h> -#include <__format/formatter.h> -#include <__format/formatter_integral.h> -#include <__format/parser_std_format_spec.h> -#include <__thread/id.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__format/concepts.h> +#include <__cxx03/__format/format_parse_context.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_integral.h> +#include <__cxx03/__format/parser_std_format_spec.h> +#include <__cxx03/__thread/id.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/id.h b/libcxx/include/__cxx03/__thread/id.h index 6db0ccbfe569b..09e7bb071c61f 100644 --- a/libcxx/include/__cxx03/__thread/id.h +++ b/libcxx/include/__cxx03/__thread/id.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___THREAD_ID_H #define _LIBCPP___THREAD_ID_H -#include <__compare/ordering.h> -#include <__config> -#include <__fwd/functional.h> -#include <__fwd/ostream.h> -#include <__thread/support.h> +#include 
<__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__thread/support.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/jthread.h b/libcxx/include/__cxx03/__thread/jthread.h index b3d5c25fb71c7..840d0cae9d0a2 100644 --- a/libcxx/include/__cxx03/__thread/jthread.h +++ b/libcxx/include/__cxx03/__thread/jthread.h @@ -10,25 +10,25 @@ #ifndef _LIBCPP___THREAD_JTHREAD_H #define _LIBCPP___THREAD_JTHREAD_H -#include <__config> -#include <__functional/invoke.h> -#include <__stop_token/stop_source.h> -#include <__stop_token/stop_token.h> -#include <__thread/support.h> -#include <__thread/thread.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__stop_token/stop_source.h> +#include <__cxx03/__stop_token/stop_token.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__thread/thread.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) diff --git a/libcxx/include/__cxx03/__thread/poll_with_backoff.h b/libcxx/include/__cxx03/__thread/poll_with_backoff.h index 4f961fe3f7629..d875c1465dc81 100644 --- a/libcxx/include/__cxx03/__thread/poll_with_backoff.h +++ 
b/libcxx/include/__cxx03/__thread/poll_with_backoff.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___THREAD_POLL_WITH_BACKOFF_H #define _LIBCPP___THREAD_POLL_WITH_BACKOFF_H -#include <__chrono/duration.h> -#include <__chrono/high_resolution_clock.h> -#include <__config> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/high_resolution_clock.h> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/support.h b/libcxx/include/__cxx03/__thread/support.h index 92f1c4415e4df..3ef25c88c1a8a 100644 --- a/libcxx/include/__cxx03/__thread/support.h +++ b/libcxx/include/__cxx03/__thread/support.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP___THREAD_SUPPORT_H #define _LIBCPP___THREAD_SUPPORT_H -#include <__config> +#include <__cxx03/__config> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header @@ -107,13 +107,13 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_HAS_NO_THREADS) # if defined(_LIBCPP_HAS_THREAD_API_EXTERNAL) -# include <__thread/support/external.h> +# include <__cxx03/__thread/support/external.h> # elif defined(_LIBCPP_HAS_THREAD_API_PTHREAD) -# include <__thread/support/pthread.h> +# include <__cxx03/__thread/support/pthread.h> # elif defined(_LIBCPP_HAS_THREAD_API_C11) -# include <__thread/support/c11.h> +# include <__cxx03/__thread/support/c11.h> # elif defined(_LIBCPP_HAS_THREAD_API_WIN32) -# include <__thread/support/windows.h> +# include <__cxx03/__thread/support/windows.h> # else # error "No threading API was selected" # endif diff --git a/libcxx/include/__cxx03/__thread/support/c11.h b/libcxx/include/__cxx03/__thread/support/c11.h index fe00a2d97fadc..7b791388029b9 100644 --- a/libcxx/include/__cxx03/__thread/support/c11.h +++ b/libcxx/include/__cxx03/__thread/support/c11.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___THREAD_SUPPORT_C11_H #define _LIBCPP___THREAD_SUPPORT_C11_H -#include <__chrono/convert_to_timespec.h> -#include <__chrono/duration.h> 
-#include <__config> -#include -#include -#include +#include <__cxx03/__chrono/convert_to_timespec.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/ctime> +#include <__cxx03/errno.h> +#include <__cxx03/threads.h> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/support/external.h b/libcxx/include/__cxx03/__thread/support/external.h index d5e212491cfdb..d4b476c435049 100644 --- a/libcxx/include/__cxx03/__thread/support/external.h +++ b/libcxx/include/__cxx03/__thread/support/external.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___THREAD_SUPPORT_EXTERNAL_H #define _LIBCPP___THREAD_SUPPORT_EXTERNAL_H -#include <__config> +#include <__cxx03/__config> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header #endif -#include <__external_threading> +#include <__cxx03/__external_threading> #endif // _LIBCPP___THREAD_SUPPORT_EXTERNAL_H diff --git a/libcxx/include/__cxx03/__thread/support/pthread.h b/libcxx/include/__cxx03/__thread/support/pthread.h index 531f3e71de839..9c3646896c108 100644 --- a/libcxx/include/__cxx03/__thread/support/pthread.h +++ b/libcxx/include/__cxx03/__thread/support/pthread.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP___THREAD_SUPPORT_PTHREAD_H #define _LIBCPP___THREAD_SUPPORT_PTHREAD_H -#include <__chrono/convert_to_timespec.h> -#include <__chrono/duration.h> -#include <__config> -#include -#include -#include -#include +#include <__cxx03/__chrono/convert_to_timespec.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/ctime> +#include <__cxx03/errno.h> +#include <__cxx03/pthread.h> +#include <__cxx03/sched.h> #ifdef __MVS__ -# include <__support/ibm/nanosleep.h> +# include <__cxx03/__support/ibm/nanosleep.h> #endif // Some platforms require in order for @@ -30,7 +30,7 @@ // Include here to work around that. 
// This checks wheter a Clang module is built #if __building_module(std) -# include +# include <__cxx03/math.h> #endif #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER diff --git a/libcxx/include/__cxx03/__thread/support/windows.h b/libcxx/include/__cxx03/__thread/support/windows.h index 5dc4fa14f45b6..b7b859b060efa 100644 --- a/libcxx/include/__cxx03/__thread/support/windows.h +++ b/libcxx/include/__cxx03/__thread/support/windows.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___THREAD_SUPPORT_WINDOWS_H #define _LIBCPP___THREAD_SUPPORT_WINDOWS_H -#include <__chrono/duration.h> -#include <__config> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__config> +#include <__cxx03/ctime> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__thread/this_thread.h b/libcxx/include/__cxx03/__thread/this_thread.h index de7eea282c874..fe6c19aaf9b45 100644 --- a/libcxx/include/__cxx03/__thread/this_thread.h +++ b/libcxx/include/__cxx03/__thread/this_thread.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___THREAD_THIS_THREAD_H #define _LIBCPP___THREAD_THIS_THREAD_H -#include <__chrono/steady_clock.h> -#include <__chrono/time_point.h> -#include <__condition_variable/condition_variable.h> -#include <__config> -#include <__mutex/mutex.h> -#include <__mutex/unique_lock.h> -#include <__thread/support.h> +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__condition_variable/condition_variable.h> +#include <__cxx03/__config> +#include <__cxx03/__mutex/mutex.h> +#include <__cxx03/__mutex/unique_lock.h> +#include <__cxx03/__thread/support.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__thread/thread.h b/libcxx/include/__cxx03/__thread/thread.h index d2254a695f5e8..fdd875e03cc68 100644 --- 
a/libcxx/include/__cxx03/__thread/thread.h +++ b/libcxx/include/__cxx03/__thread/thread.h @@ -10,22 +10,22 @@ #ifndef _LIBCPP___THREAD_THREAD_H #define _LIBCPP___THREAD_THREAD_H -#include <__condition_variable/condition_variable.h> -#include <__config> -#include <__exception/terminate.h> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__memory/unique_ptr.h> -#include <__mutex/mutex.h> -#include <__system_error/system_error.h> -#include <__thread/id.h> -#include <__thread/support.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__condition_variable/condition_variable.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/terminate.h> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__mutex/mutex.h> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/__thread/id.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/tuple> #ifndef _LIBCPP_HAS_NO_LOCALIZATION -# include -# include +# include <__cxx03/locale> +# include <__cxx03/sstream> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -33,7 +33,7 @@ #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__thread/timed_backoff_policy.h b/libcxx/include/__cxx03/__thread/timed_backoff_policy.h index 838c918a57ef0..eec6f225d717d 100644 --- a/libcxx/include/__cxx03/__thread/timed_backoff_policy.h +++ b/libcxx/include/__cxx03/__thread/timed_backoff_policy.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP___THREAD_TIMED_BACKOFF_POLICY_H #define _LIBCPP___THREAD_TIMED_BACKOFF_POLICY_H -#include <__config> +#include <__cxx03/__config> #ifndef _LIBCPP_HAS_NO_THREADS -# include <__chrono/duration.h> -# include <__thread/support.h> +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__thread/support.h> # 
if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tree b/libcxx/include/__cxx03/__tree index 1990fa602d39c..2b8521a091536 100644 --- a/libcxx/include/__cxx03/__tree +++ b/libcxx/include/__cxx03/__tree @@ -10,42 +10,42 @@ #ifndef _LIBCPP___TREE #define _LIBCPP___TREE -#include <__algorithm/min.h> -#include <__assert> -#include <__config> -#include <__functional/invoke.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__memory/addressof.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/can_extract_key.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/remove_const_ref.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/can_extract_key.h> +#include 
<__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/remove_const_ref.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__tuple/find_index.h b/libcxx/include/__cxx03/__tuple/find_index.h index 133b00419d0c6..899e7f2d7b41f 100644 --- a/libcxx/include/__cxx03/__tuple/find_index.h +++ b/libcxx/include/__cxx03/__tuple/find_index.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TUPLE_FIND_INDEX_H #define _LIBCPP___TUPLE_FIND_INDEX_H -#include <__config> -#include <__type_traits/is_same.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/ignore.h b/libcxx/include/__cxx03/__tuple/ignore.h index 43cce5387411b..2d1800ae0d708 100644 --- a/libcxx/include/__cxx03/__tuple/ignore.h +++ b/libcxx/include/__cxx03/__tuple/ignore.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TUPLE_IGNORE_H #define _LIBCPP___TUPLE_IGNORE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/make_tuple_types.h 
b/libcxx/include/__cxx03/__tuple/make_tuple_types.h index 9e0fefae2f2f5..b7abfa74f6f11 100644 --- a/libcxx/include/__cxx03/__tuple/make_tuple_types.h +++ b/libcxx/include/__cxx03/__tuple/make_tuple_types.h @@ -9,17 +9,17 @@ #ifndef _LIBCPP___TUPLE_MAKE_TUPLE_TYPES_H #define _LIBCPP___TUPLE_MAKE_TUPLE_TYPES_H -#include <__config> -#include <__fwd/array.h> -#include <__fwd/tuple.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_indices.h> -#include <__tuple/tuple_size.h> -#include <__tuple/tuple_types.h> -#include <__type_traits/copy_cvref.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_reference.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/__type_traits/copy_cvref.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/sfinae_helpers.h b/libcxx/include/__cxx03/__tuple/sfinae_helpers.h index c7145e0b011a9..d6889364a9162 100644 --- a/libcxx/include/__cxx03/__tuple/sfinae_helpers.h +++ b/libcxx/include/__cxx03/__tuple/sfinae_helpers.h @@ -9,21 +9,21 @@ #ifndef _LIBCPP___TUPLE_SFINAE_HELPERS_H #define _LIBCPP___TUPLE_SFINAE_HELPERS_H -#include <__config> -#include <__fwd/tuple.h> -#include <__tuple/make_tuple_types.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_like_ext.h> -#include <__tuple/tuple_size.h> -#include <__tuple/tuple_types.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_same.h> -#include 
<__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/make_tuple_types.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_like_ext.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_element.h b/libcxx/include/__cxx03/__tuple/tuple_element.h index 9127c47dc8f1a..d00879a069423 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_element.h +++ b/libcxx/include/__cxx03/__tuple/tuple_element.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TUPLE_TUPLE_ELEMENT_H #define _LIBCPP___TUPLE_TUPLE_ELEMENT_H -#include <__config> -#include <__tuple/tuple_indices.h> -#include <__tuple/tuple_types.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_indices.h b/libcxx/include/__cxx03/__tuple/tuple_indices.h index 501e711255ec1..6e51b69e906b0 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_indices.h +++ b/libcxx/include/__cxx03/__tuple/tuple_indices.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TUPLE_MAKE_TUPLE_INDICES_H #define _LIBCPP___TUPLE_MAKE_TUPLE_INDICES_H -#include <__config> -#include <__utility/integer_sequence.h> -#include +#include <__cxx03/__config> +#include 
<__cxx03/__utility/integer_sequence.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_like.h b/libcxx/include/__cxx03/__tuple/tuple_like.h index c080a3dcf1e25..23f42d58312a6 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_like.h +++ b/libcxx/include/__cxx03/__tuple/tuple_like.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TUPLE_TUPLE_LIKE_H #define _LIBCPP___TUPLE_TUPLE_LIKE_H -#include <__config> -#include <__fwd/subrange.h> -#include <__tuple/tuple_like_no_subrange.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/subrange.h> +#include <__cxx03/__tuple/tuple_like_no_subrange.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_like_ext.h b/libcxx/include/__cxx03/__tuple/tuple_like_ext.h index 0cc21e0b75fd1..5bb07cc8b5e74 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_like_ext.h +++ b/libcxx/include/__cxx03/__tuple/tuple_like_ext.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TUPLE_TUPLE_LIKE_EXT_H #define _LIBCPP___TUPLE_TUPLE_LIKE_EXT_H -#include <__config> -#include <__fwd/array.h> -#include <__fwd/pair.h> -#include <__fwd/tuple.h> -#include <__tuple/tuple_types.h> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_like_no_subrange.h b/libcxx/include/__cxx03/__tuple/tuple_like_no_subrange.h index 274b0bf188e1f..af4a14d2d535f 100644 --- 
a/libcxx/include/__cxx03/__tuple/tuple_like_no_subrange.h +++ b/libcxx/include/__cxx03/__tuple/tuple_like_no_subrange.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___TUPLE_TUPLE_LIKE_NO_SUBRANGE_H #define _LIBCPP___TUPLE_TUPLE_LIKE_NO_SUBRANGE_H -#include <__config> -#include <__fwd/array.h> -#include <__fwd/complex.h> -#include <__fwd/pair.h> -#include <__fwd/tuple.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/remove_cvref.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/complex.h> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_size.h b/libcxx/include/__cxx03/__tuple/tuple_size.h index 18a17fd4d5878..ada7a522182e2 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_size.h +++ b/libcxx/include/__cxx03/__tuple/tuple_size.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___TUPLE_TUPLE_SIZE_H #define _LIBCPP___TUPLE_TUPLE_SIZE_H -#include <__config> -#include <__fwd/tuple.h> -#include <__tuple/tuple_types.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_volatile.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__tuple/tuple_types.h b/libcxx/include/__cxx03/__tuple/tuple_types.h index 7e1256cf8790e..562d199a02ba2 100644 --- a/libcxx/include/__cxx03/__tuple/tuple_types.h +++ b/libcxx/include/__cxx03/__tuple/tuple_types.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TUPLE_TUPLE_TYPES_H #define _LIBCPP___TUPLE_TUPLE_TYPES_H -#include <__config> +#include 
<__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_const.h b/libcxx/include/__cxx03/__type_traits/add_const.h index 9a6f1c10299f7..ce670824873ee 100644 --- a/libcxx/include/__cxx03/__type_traits/add_const.h +++ b/libcxx/include/__cxx03/__type_traits/add_const.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_CONST_H #define _LIBCPP___TYPE_TRAITS_ADD_CONST_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_cv.h b/libcxx/include/__cxx03/__type_traits/add_cv.h index 9e23e5ceb7a3b..43eb05fa40487 100644 --- a/libcxx/include/__cxx03/__type_traits/add_cv.h +++ b/libcxx/include/__cxx03/__type_traits/add_cv.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_CV_H #define _LIBCPP___TYPE_TRAITS_ADD_CV_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_lvalue_reference.h b/libcxx/include/__cxx03/__type_traits/add_lvalue_reference.h index a633e39045320..ca6ee5dc0b296 100644 --- a/libcxx/include/__cxx03/__type_traits/add_lvalue_reference.h +++ b/libcxx/include/__cxx03/__type_traits/add_lvalue_reference.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_LVALUE_REFERENCE_H #define _LIBCPP___TYPE_TRAITS_ADD_LVALUE_REFERENCE_H -#include <__config> -#include <__type_traits/is_referenceable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_referenceable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_pointer.h b/libcxx/include/__cxx03/__type_traits/add_pointer.h index 5aac7d5cfa90d..ec3ba5d8f404f 100644 --- a/libcxx/include/__cxx03/__type_traits/add_pointer.h +++ b/libcxx/include/__cxx03/__type_traits/add_pointer.h @@ -9,10 
+9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_POINTER_H #define _LIBCPP___TYPE_TRAITS_ADD_POINTER_H -#include <__config> -#include <__type_traits/is_referenceable.h> -#include <__type_traits/is_void.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_referenceable.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_rvalue_reference.h b/libcxx/include/__cxx03/__type_traits/add_rvalue_reference.h index a54aae7ec8de5..dc3be42ab0171 100644 --- a/libcxx/include/__cxx03/__type_traits/add_rvalue_reference.h +++ b/libcxx/include/__cxx03/__type_traits/add_rvalue_reference.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_RVALUE_REFERENCE_H #define _LIBCPP___TYPE_TRAITS_ADD_RVALUE_REFERENCE_H -#include <__config> -#include <__type_traits/is_referenceable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_referenceable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/add_volatile.h b/libcxx/include/__cxx03/__type_traits/add_volatile.h index 56b7dfaac026e..35ff8d7435c6d 100644 --- a/libcxx/include/__cxx03/__type_traits/add_volatile.h +++ b/libcxx/include/__cxx03/__type_traits/add_volatile.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_ADD_VOLATILE_H #define _LIBCPP___TYPE_TRAITS_ADD_VOLATILE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/aligned_storage.h b/libcxx/include/__cxx03/__type_traits/aligned_storage.h index 46aae12832f86..fc9fe4edd27ae 100644 --- a/libcxx/include/__cxx03/__type_traits/aligned_storage.h +++ b/libcxx/include/__cxx03/__type_traits/aligned_storage.h @@ -9,12 +9,12 @@ #ifndef 
_LIBCPP___TYPE_TRAITS_ALIGNED_STORAGE_H #define _LIBCPP___TYPE_TRAITS_ALIGNED_STORAGE_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/nat.h> -#include <__type_traits/type_list.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/type_list.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/aligned_union.h b/libcxx/include/__cxx03/__type_traits/aligned_union.h index 005ed9a096ea8..d2c695d675381 100644 --- a/libcxx/include/__cxx03/__type_traits/aligned_union.h +++ b/libcxx/include/__cxx03/__type_traits/aligned_union.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_ALIGNED_UNION_H #define _LIBCPP___TYPE_TRAITS_ALIGNED_UNION_H -#include <__config> -#include <__type_traits/aligned_storage.h> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/aligned_storage.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/alignment_of.h b/libcxx/include/__cxx03/__type_traits/alignment_of.h index f2d069bf2488f..f2d5317d1a192 100644 --- a/libcxx/include/__cxx03/__type_traits/alignment_of.h +++ b/libcxx/include/__cxx03/__type_traits/alignment_of.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_ALIGNMENT_OF_H #define _LIBCPP___TYPE_TRAITS_ALIGNMENT_OF_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header 
diff --git a/libcxx/include/__cxx03/__type_traits/can_extract_key.h b/libcxx/include/__cxx03/__type_traits/can_extract_key.h index b8359d0708810..8069ce25f4a67 100644 --- a/libcxx/include/__cxx03/__type_traits/can_extract_key.h +++ b/libcxx/include/__cxx03/__type_traits/can_extract_key.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_CAN_EXTRACT_KEY_H #define _LIBCPP___TYPE_TRAITS_CAN_EXTRACT_KEY_H -#include <__config> -#include <__fwd/pair.h> -#include <__type_traits/conditional.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_const_ref.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_const_ref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/common_reference.h b/libcxx/include/__cxx03/__type_traits/common_reference.h index c802902eb19fc..ecaf83323f2f1 100644 --- a/libcxx/include/__cxx03/__type_traits/common_reference.h +++ b/libcxx/include/__cxx03/__type_traits/common_reference.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___TYPE_TRAITS_COMMON_REFERENCE_H #define _LIBCPP___TYPE_TRAITS_COMMON_REFERENCE_H -#include <__config> -#include <__type_traits/common_type.h> -#include <__type_traits/copy_cv.h> -#include <__type_traits/copy_cvref.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/copy_cv.h> +#include <__cxx03/__type_traits/copy_cvref.h> 
+#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/common_type.h b/libcxx/include/__cxx03/__type_traits/common_type.h index f6bd9ed71b7a4..9f7d0cef37faf 100644 --- a/libcxx/include/__cxx03/__type_traits/common_type.h +++ b/libcxx/include/__cxx03/__type_traits/common_type.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_COMMON_TYPE_H #define _LIBCPP___TYPE_TRAITS_COMMON_TYPE_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/conditional.h b/libcxx/include/__cxx03/__type_traits/conditional.h index 5b5445a837427..e03dcf0708016 100644 --- a/libcxx/include/__cxx03/__type_traits/conditional.h +++ b/libcxx/include/__cxx03/__type_traits/conditional.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_CONDITIONAL_H #define _LIBCPP___TYPE_TRAITS_CONDITIONAL_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/conjunction.h b/libcxx/include/__cxx03/__type_traits/conjunction.h index 
c2995591bbc28..9cb6a87764a95 100644 --- a/libcxx/include/__cxx03/__type_traits/conjunction.h +++ b/libcxx/include/__cxx03/__type_traits/conjunction.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TYPE_TRAITS_CONJUNCTION_H #define _LIBCPP___TYPE_TRAITS_CONJUNCTION_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/copy_cv.h b/libcxx/include/__cxx03/__type_traits/copy_cv.h index d482cb42bffed..1ebda47a18bbe 100644 --- a/libcxx/include/__cxx03/__type_traits/copy_cv.h +++ b/libcxx/include/__cxx03/__type_traits/copy_cv.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_COPY_CV_H #define _LIBCPP___TYPE_TRAITS_COPY_CV_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/copy_cvref.h b/libcxx/include/__cxx03/__type_traits/copy_cvref.h index 8bbf8efdf44de..d852bcd3b99c0 100644 --- a/libcxx/include/__cxx03/__type_traits/copy_cvref.h +++ b/libcxx/include/__cxx03/__type_traits/copy_cvref.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_COPY_CVREF_H #define _LIBCPP___TYPE_TRAITS_COPY_CVREF_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/copy_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/copy_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header diff --git a/libcxx/include/__cxx03/__type_traits/datasizeof.h b/libcxx/include/__cxx03/__type_traits/datasizeof.h index a27baf67cc2d8..7dd87e162d341 100644 --- a/libcxx/include/__cxx03/__type_traits/datasizeof.h +++ b/libcxx/include/__cxx03/__type_traits/datasizeof.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_DATASIZEOF_H #define _LIBCPP___TYPE_TRAITS_DATASIZEOF_H -#include <__config> -#include <__type_traits/is_class.h> -#include <__type_traits/is_final.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_final.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/decay.h b/libcxx/include/__cxx03/__type_traits/decay.h index 7412044f93179..3563543a30912 100644 --- a/libcxx/include/__cxx03/__type_traits/decay.h +++ b/libcxx/include/__cxx03/__type_traits/decay.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___TYPE_TRAITS_DECAY_H #define _LIBCPP___TYPE_TRAITS_DECAY_H -#include <__config> -#include <__type_traits/add_pointer.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_referenceable.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_extent.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_referenceable.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_extent.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/dependent_type.h 
b/libcxx/include/__cxx03/__type_traits/dependent_type.h index db8a869820db3..fed130a8f101a 100644 --- a/libcxx/include/__cxx03/__type_traits/dependent_type.h +++ b/libcxx/include/__cxx03/__type_traits/dependent_type.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_DEPENDENT_TYPE_H #define _LIBCPP___TYPE_TRAITS_DEPENDENT_TYPE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/desugars_to.h b/libcxx/include/__cxx03/__type_traits/desugars_to.h index 97a2ee5448f20..e16fb59007fe9 100644 --- a/libcxx/include/__cxx03/__type_traits/desugars_to.h +++ b/libcxx/include/__cxx03/__type_traits/desugars_to.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H #define _LIBCPP___TYPE_TRAITS_DESUGARS_TO_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/disjunction.h b/libcxx/include/__cxx03/__type_traits/disjunction.h index 2c89528d9f2fc..717a439669e1d 100644 --- a/libcxx/include/__cxx03/__type_traits/disjunction.h +++ b/libcxx/include/__cxx03/__type_traits/disjunction.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_DISJUNCTION_H #define _LIBCPP___TYPE_TRAITS_DISJUNCTION_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/enable_if.h b/libcxx/include/__cxx03/__type_traits/enable_if.h index 77da9622ca28f..d21c1301e0a2e 100644 --- a/libcxx/include/__cxx03/__type_traits/enable_if.h +++ b/libcxx/include/__cxx03/__type_traits/enable_if.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_ENABLE_IF_H #define _LIBCPP___TYPE_TRAITS_ENABLE_IF_H -#include <__config> +#include <__cxx03/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/extent.h b/libcxx/include/__cxx03/__type_traits/extent.h index bab03fe997eb6..d29fef59a5a9e 100644 --- a/libcxx/include/__cxx03/__type_traits/extent.h +++ b/libcxx/include/__cxx03/__type_traits/extent.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_EXTENT_H #define _LIBCPP___TYPE_TRAITS_EXTENT_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/has_unique_object_representation.h b/libcxx/include/__cxx03/__type_traits/has_unique_object_representation.h index 98c440c16bf26..4a507fe7fcac8 100644 --- a/libcxx/include/__cxx03/__type_traits/has_unique_object_representation.h +++ b/libcxx/include/__cxx03/__type_traits/has_unique_object_representation.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_HAS_UNIQUE_OBJECT_REPRESENTATION_H #define _LIBCPP___TYPE_TRAITS_HAS_UNIQUE_OBJECT_REPRESENTATION_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/remove_all_extents.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/remove_all_extents.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/has_virtual_destructor.h b/libcxx/include/__cxx03/__type_traits/has_virtual_destructor.h index 4ce96e649e67a..060bd1bfe58c8 100644 --- a/libcxx/include/__cxx03/__type_traits/has_virtual_destructor.h +++ b/libcxx/include/__cxx03/__type_traits/has_virtual_destructor.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_HAS_VIRTUAL_DESTRUCTOR_H #define _LIBCPP___TYPE_TRAITS_HAS_VIRTUAL_DESTRUCTOR_H -#include <__config> -#include 
<__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/integral_constant.h b/libcxx/include/__cxx03/__type_traits/integral_constant.h index 23e87e27feff5..bfc11c8a45163 100644 --- a/libcxx/include/__cxx03/__type_traits/integral_constant.h +++ b/libcxx/include/__cxx03/__type_traits/integral_constant.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_INTEGRAL_CONSTANT_H #define _LIBCPP___TYPE_TRAITS_INTEGRAL_CONSTANT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/invoke.h b/libcxx/include/__cxx03/__type_traits/invoke.h index 71db32ae6a3ce..9ece1c8749629 100644 --- a/libcxx/include/__cxx03/__type_traits/invoke.h +++ b/libcxx/include/__cxx03/__type_traits/invoke.h @@ -10,20 +10,20 @@ #ifndef _LIBCPP___TYPE_TRAITS_INVOKE_H #define _LIBCPP___TYPE_TRAITS_INVOKE_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_base_of.h> -#include <__type_traits/is_core_convertible.h> -#include <__type_traits/is_member_pointer.h> -#include <__type_traits/is_reference_wrapper.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_void.h> -#include <__type_traits/nat.h> -#include <__utility/declval.h> -#include <__utility/forward.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/is_core_convertible.h> +#include <__cxx03/__type_traits/is_member_pointer.h> 
+#include <__cxx03/__type_traits/is_reference_wrapper.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_abstract.h b/libcxx/include/__cxx03/__type_traits/is_abstract.h index 4aa456be1c48e..785ae6772f710 100644 --- a/libcxx/include/__cxx03/__type_traits/is_abstract.h +++ b/libcxx/include/__cxx03/__type_traits/is_abstract.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ABSTRACT_H #define _LIBCPP___TYPE_TRAITS_IS_ABSTRACT_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_aggregate.h b/libcxx/include/__cxx03/__type_traits/is_aggregate.h index 4e0988071adee..3da7efa0ce640 100644 --- a/libcxx/include/__cxx03/__type_traits/is_aggregate.h +++ b/libcxx/include/__cxx03/__type_traits/is_aggregate.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_AGGREGATE_H #define _LIBCPP___TYPE_TRAITS_IS_AGGREGATE_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_allocator.h b/libcxx/include/__cxx03/__type_traits/is_allocator.h index 144ffac4d7ce5..1b063e84e545f 100644 --- a/libcxx/include/__cxx03/__type_traits/is_allocator.h +++ b/libcxx/include/__cxx03/__type_traits/is_allocator.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TYPE_IS_ALLOCATOR_H #define _LIBCPP___TYPE_IS_ALLOCATOR_H -#include <__config> -#include <__type_traits/integral_constant.h> 
-#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_always_bitcastable.h b/libcxx/include/__cxx03/__type_traits/is_always_bitcastable.h index 5bc650b41358a..6f5b6631ba40f 100644 --- a/libcxx/include/__cxx03/__type_traits/is_always_bitcastable.h +++ b/libcxx/include/__cxx03/__type_traits/is_always_bitcastable.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ALWAYS_BITCASTABLE_H #define _LIBCPP___TYPE_TRAITS_IS_ALWAYS_BITCASTABLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_arithmetic.h b/libcxx/include/__cxx03/__type_traits/is_arithmetic.h index c9713e1840a7b..900a5b5b4176d 100644 --- a/libcxx/include/__cxx03/__type_traits/is_arithmetic.h +++ b/libcxx/include/__cxx03/__type_traits/is_arithmetic.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ARITHMETIC_H #define _LIBCPP___TYPE_TRAITS_IS_ARITHMETIC_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_floating_point.h> -#include 
<__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_array.h b/libcxx/include/__cxx03/__type_traits/is_array.h index dc23de28d2c63..d4276ed61a4f6 100644 --- a/libcxx/include/__cxx03/__type_traits/is_array.h +++ b/libcxx/include/__cxx03/__type_traits/is_array.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ARRAY_H #define _LIBCPP___TYPE_TRAITS_IS_ARRAY_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_assignable.h b/libcxx/include/__cxx03/__type_traits/is_assignable.h index cfb4699777878..ba1d5d1afcdd1 100644 --- a/libcxx/include/__cxx03/__type_traits/is_assignable.h +++ b/libcxx/include/__cxx03/__type_traits/is_assignable.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H #define _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_base_of.h b/libcxx/include/__cxx03/__type_traits/is_base_of.h index 090abeeb54dcc..0a31bfaa74127 100644 --- a/libcxx/include/__cxx03/__type_traits/is_base_of.h +++ 
b/libcxx/include/__cxx03/__type_traits/is_base_of.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_BASE_OF_H #define _LIBCPP___TYPE_TRAITS_IS_BASE_OF_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_bounded_array.h b/libcxx/include/__cxx03/__type_traits/is_bounded_array.h index 211403d638d08..aeb2235562838 100644 --- a/libcxx/include/__cxx03/__type_traits/is_bounded_array.h +++ b/libcxx/include/__cxx03/__type_traits/is_bounded_array.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H #define _LIBCPP___TYPE_TRAITS_IS_BOUNDED_ARRAY_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_callable.h b/libcxx/include/__cxx03/__type_traits/is_callable.h index 49724fe892ee5..4956d26b047fa 100644 --- a/libcxx/include/__cxx03/__type_traits/is_callable.h +++ b/libcxx/include/__cxx03/__type_traits/is_callable.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CALLABLE_H #define _LIBCPP___TYPE_TRAITS_IS_CALLABLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_char_like_type.h b/libcxx/include/__cxx03/__type_traits/is_char_like_type.h index 26205843047ca..2896a5794301d 100644 --- a/libcxx/include/__cxx03/__type_traits/is_char_like_type.h +++ 
b/libcxx/include/__cxx03/__type_traits/is_char_like_type.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CHAR_LIKE_TYPE_H #define _LIBCPP___TYPE_TRAITS_IS_CHAR_LIKE_TYPE_H -#include <__config> -#include <__type_traits/conjunction.h> -#include <__type_traits/is_standard_layout.h> -#include <__type_traits/is_trivial.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/is_standard_layout.h> +#include <__cxx03/__type_traits/is_trivial.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_class.h b/libcxx/include/__cxx03/__type_traits/is_class.h index 034f76a7865e3..c56f3679ecb35 100644 --- a/libcxx/include/__cxx03/__type_traits/is_class.h +++ b/libcxx/include/__cxx03/__type_traits/is_class.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CLASS_H #define _LIBCPP___TYPE_TRAITS_IS_CLASS_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_compound.h b/libcxx/include/__cxx03/__type_traits/is_compound.h index cd208ceab2886..c48c88643c8e1 100644 --- a/libcxx/include/__cxx03/__type_traits/is_compound.h +++ b/libcxx/include/__cxx03/__type_traits/is_compound.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_COMPOUND_H #define _LIBCPP___TYPE_TRAITS_IS_COMPOUND_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_fundamental.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_fundamental.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_const.h b/libcxx/include/__cxx03/__type_traits/is_const.h index 
47ef70872b790..2b297ad472bd5 100644 --- a/libcxx/include/__cxx03/__type_traits/is_const.h +++ b/libcxx/include/__cxx03/__type_traits/is_const.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CONST_H #define _LIBCPP___TYPE_TRAITS_IS_CONST_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h b/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h index 05e070a747884..9abd5ee9ce41a 100644 --- a/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h +++ b/libcxx/include/__cxx03/__type_traits/is_constant_evaluated.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CONSTANT_EVALUATED_H #define _LIBCPP___TYPE_TRAITS_IS_CONSTANT_EVALUATED_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_constructible.h b/libcxx/include/__cxx03/__type_traits/is_constructible.h index 567bd165c7152..18c58311ed033 100644 --- a/libcxx/include/__cxx03/__type_traits/is_constructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_constructible.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H #define _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_convertible.h b/libcxx/include/__cxx03/__type_traits/is_convertible.h index 
414c2a6d6a0de..a2b2630f109be 100644 --- a/libcxx/include/__cxx03/__type_traits/is_convertible.h +++ b/libcxx/include/__cxx03/__type_traits/is_convertible.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CONVERTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_CONVERTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_core_convertible.h b/libcxx/include/__cxx03/__type_traits/is_core_convertible.h index 0de177c7771f4..8088dfb7d9589 100644 --- a/libcxx/include/__cxx03/__type_traits/is_core_convertible.h +++ b/libcxx/include/__cxx03/__type_traits/is_core_convertible.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_CORE_CONVERTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_CORE_CONVERTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_destructible.h b/libcxx/include/__cxx03/__type_traits/is_destructible.h index 3248b07d36ee6..ce94e8c27f195 100644 --- a/libcxx/include/__cxx03/__type_traits/is_destructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_destructible.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_DESTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_DESTRUCTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_all_extents.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_reference.h> +#include 
<__cxx03/__type_traits/remove_all_extents.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_empty.h b/libcxx/include/__cxx03/__type_traits/is_empty.h index 951d93b5a2f10..89cb681712d31 100644 --- a/libcxx/include/__cxx03/__type_traits/is_empty.h +++ b/libcxx/include/__cxx03/__type_traits/is_empty.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_EMPTY_H #define _LIBCPP___TYPE_TRAITS_IS_EMPTY_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_enum.h b/libcxx/include/__cxx03/__type_traits/is_enum.h index 2fab6db2c8d50..da08f569586d6 100644 --- a/libcxx/include/__cxx03/__type_traits/is_enum.h +++ b/libcxx/include/__cxx03/__type_traits/is_enum.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_ENUM_H #define _LIBCPP___TYPE_TRAITS_IS_ENUM_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_equality_comparable.h b/libcxx/include/__cxx03/__type_traits/is_equality_comparable.h index 4397f743e5ee9..7948482694132 100644 --- a/libcxx/include/__cxx03/__type_traits/is_equality_comparable.h +++ b/libcxx/include/__cxx03/__type_traits/is_equality_comparable.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_EQUALITY_COMPARABLE_H #define _LIBCPP___TYPE_TRAITS_IS_EQUALITY_COMPARABLE_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_signed.h> 
-#include <__type_traits/is_void.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_execution_policy.h b/libcxx/include/__cxx03/__type_traits/is_execution_policy.h index 6884f17ba16c8..d0da5bb45a70f 100644 --- a/libcxx/include/__cxx03/__type_traits/is_execution_policy.h +++ b/libcxx/include/__cxx03/__type_traits/is_execution_policy.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_EXECUTION_POLICY_H #define _LIBCPP___TYPE_TRAITS_IS_EXECUTION_POLICY_H -#include <__config> -#include <__type_traits/remove_cvref.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_cvref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_final.h b/libcxx/include/__cxx03/__type_traits/is_final.h index 499c5e3a1edca..b55fadce9efce 100644 --- a/libcxx/include/__cxx03/__type_traits/is_final.h +++ b/libcxx/include/__cxx03/__type_traits/is_final.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_FINAL_H #define _LIBCPP___TYPE_TRAITS_IS_FINAL_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_floating_point.h 
b/libcxx/include/__cxx03/__type_traits/is_floating_point.h index add34782dfa09..99898272651c4 100644 --- a/libcxx/include/__cxx03/__type_traits/is_floating_point.h +++ b/libcxx/include/__cxx03/__type_traits/is_floating_point.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_FLOATING_POINT_H #define _LIBCPP___TYPE_TRAITS_IS_FLOATING_POINT_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_function.h b/libcxx/include/__cxx03/__type_traits/is_function.h index 98fedd0ad96d9..0337dfac18c68 100644 --- a/libcxx/include/__cxx03/__type_traits/is_function.h +++ b/libcxx/include/__cxx03/__type_traits/is_function.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_FUNCTIONAL_H #define _LIBCPP___TYPE_TRAITS_IS_FUNCTIONAL_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_fundamental.h b/libcxx/include/__cxx03/__type_traits/is_fundamental.h index 55f8e41f75f45..cc57ac7b01948 100644 --- a/libcxx/include/__cxx03/__type_traits/is_fundamental.h +++ b/libcxx/include/__cxx03/__type_traits/is_fundamental.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_FUNDAMENTAL_H #define _LIBCPP___TYPE_TRAITS_IS_FUNDAMENTAL_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_null_pointer.h> -#include <__type_traits/is_void.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_null_pointer.h> +#include <__cxx03/__type_traits/is_void.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_implicitly_default_constructible.h b/libcxx/include/__cxx03/__type_traits/is_implicitly_default_constructible.h index d5dadd7b870dd..b9bf0523d937c 100644 --- a/libcxx/include/__cxx03/__type_traits/is_implicitly_default_constructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_implicitly_default_constructible.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_IMPLICITLY_DEFAULT_CONSTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_IMPLICITLY_DEFAULT_CONSTRUCTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constructible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constructible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_integral.h b/libcxx/include/__cxx03/__type_traits/is_integral.h index 26969885af8df..680875c0e737d 100644 --- a/libcxx/include/__cxx03/__type_traits/is_integral.h +++ b/libcxx/include/__cxx03/__type_traits/is_integral.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_INTEGRAL_H #define _LIBCPP___TYPE_TRAITS_IS_INTEGRAL_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_literal_type.h b/libcxx/include/__cxx03/__type_traits/is_literal_type.h index 10e23bceffbda..ad7ad3f2eb3cf 100644 --- a/libcxx/include/__cxx03/__type_traits/is_literal_type.h +++ b/libcxx/include/__cxx03/__type_traits/is_literal_type.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_LITERAL_TYPE #define 
_LIBCPP___TYPE_TRAITS_IS_LITERAL_TYPE -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_member_pointer.h b/libcxx/include/__cxx03/__type_traits/is_member_pointer.h index 3e2753ac4228c..0eba57ddb827c 100644 --- a/libcxx/include/__cxx03/__type_traits/is_member_pointer.h +++ b/libcxx/include/__cxx03/__type_traits/is_member_pointer.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_MEMBER_POINTER_H #define _LIBCPP___TYPE_TRAITS_IS_MEMBER_POINTER_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_nothrow_assignable.h b/libcxx/include/__cxx03/__type_traits/is_nothrow_assignable.h index 7e00c741f83e3..adf9be8e02518 100644 --- a/libcxx/include/__cxx03/__type_traits/is_nothrow_assignable.h +++ b/libcxx/include/__cxx03/__type_traits/is_nothrow_assignable.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_nothrow_constructible.h b/libcxx/include/__cxx03/__type_traits/is_nothrow_constructible.h index 58d2b2475140b..bbf0220c57153 100644 --- 
a/libcxx/include/__cxx03/__type_traits/is_nothrow_constructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_nothrow_constructible.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_nothrow_convertible.h b/libcxx/include/__cxx03/__type_traits/is_nothrow_convertible.h index bfc5a94cbadec..b94727fce49e1 100644 --- a/libcxx/include/__cxx03/__type_traits/is_nothrow_convertible.h +++ b/libcxx/include/__cxx03/__type_traits/is_nothrow_convertible.h @@ -9,14 +9,14 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONVERTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONVERTIBLE_H -#include <__config> -#include <__type_traits/conjunction.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_void.h> -#include <__type_traits/lazy.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/lazy.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_nothrow_destructible.h 
b/libcxx/include/__cxx03/__type_traits/is_nothrow_destructible.h index c2d5ca87232aa..b84523f40b430 100644 --- a/libcxx/include/__cxx03/__type_traits/is_nothrow_destructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_nothrow_destructible.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DESTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DESTRUCTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_destructible.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_destructible.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_null_pointer.h b/libcxx/include/__cxx03/__type_traits/is_null_pointer.h index 9f5697e232684..9c605450638c3 100644 --- a/libcxx/include/__cxx03/__type_traits/is_null_pointer.h +++ b/libcxx/include/__cxx03/__type_traits/is_null_pointer.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_NULL_POINTER_H #define _LIBCPP___TYPE_TRAITS_IS_NULL_POINTER_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_object.h b/libcxx/include/__cxx03/__type_traits/is_object.h index ec04508402ce5..c61c67dc51248 100644 --- a/libcxx/include/__cxx03/__type_traits/is_object.h +++ b/libcxx/include/__cxx03/__type_traits/is_object.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_OBJECT_H #define _LIBCPP___TYPE_TRAITS_IS_OBJECT_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_pod.h b/libcxx/include/__cxx03/__type_traits/is_pod.h index 5888fbf457d8b..3381a61657841 100644 --- a/libcxx/include/__cxx03/__type_traits/is_pod.h +++ b/libcxx/include/__cxx03/__type_traits/is_pod.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_POD_H #define _LIBCPP___TYPE_TRAITS_IS_POD_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_pointer.h b/libcxx/include/__cxx03/__type_traits/is_pointer.h index 38eb7996c6864..7afce1a07f281 100644 --- a/libcxx/include/__cxx03/__type_traits/is_pointer.h +++ b/libcxx/include/__cxx03/__type_traits/is_pointer.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_POINTER_H #define _LIBCPP___TYPE_TRAITS_IS_POINTER_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_polymorphic.h b/libcxx/include/__cxx03/__type_traits/is_polymorphic.h index d122e1c87775b..577cfdbbf32c7 100644 --- a/libcxx/include/__cxx03/__type_traits/is_polymorphic.h +++ b/libcxx/include/__cxx03/__type_traits/is_polymorphic.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_POLYMORPHIC_H #define _LIBCPP___TYPE_TRAITS_IS_POLYMORPHIC_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__type_traits/is_primary_template.h b/libcxx/include/__cxx03/__type_traits/is_primary_template.h index f308dfadc8ec8..021cd9744471a 100644 --- a/libcxx/include/__cxx03/__type_traits/is_primary_template.h +++ b/libcxx/include/__cxx03/__type_traits/is_primary_template.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_PRIMARY_TEMPLATE_H #define _LIBCPP___TYPE_TRAITS_IS_PRIMARY_TEMPLATE_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_valid_expansion.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_valid_expansion.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_reference.h b/libcxx/include/__cxx03/__type_traits/is_reference.h index cc157a438e491..cca4d1ccdac4b 100644 --- a/libcxx/include/__cxx03/__type_traits/is_reference.h +++ b/libcxx/include/__cxx03/__type_traits/is_reference.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_REFERENCE_H #define _LIBCPP___TYPE_TRAITS_IS_REFERENCE_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_reference_wrapper.h b/libcxx/include/__cxx03/__type_traits/is_reference_wrapper.h index 310a910040e8b..e3f06df804005 100644 --- a/libcxx/include/__cxx03/__type_traits/is_reference_wrapper.h +++ b/libcxx/include/__cxx03/__type_traits/is_reference_wrapper.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_REFERENCE_WRAPPER_H #define _LIBCPP___TYPE_TRAITS_IS_REFERENCE_WRAPPER_H -#include <__config> -#include <__fwd/functional.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/remove_cv.h> +#include 
<__cxx03/__config> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_referenceable.h b/libcxx/include/__cxx03/__type_traits/is_referenceable.h index 4b34ec2572317..13ef875538488 100644 --- a/libcxx/include/__cxx03/__type_traits/is_referenceable.h +++ b/libcxx/include/__cxx03/__type_traits/is_referenceable.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_REFERENCEABLE_H #define _LIBCPP___TYPE_TRAITS_IS_REFERENCEABLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_same.h b/libcxx/include/__cxx03/__type_traits/is_same.h index 9561b7b5d6da3..29ff5116e8e89 100644 --- a/libcxx/include/__cxx03/__type_traits/is_same.h +++ b/libcxx/include/__cxx03/__type_traits/is_same.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_SAME_H #define _LIBCPP___TYPE_TRAITS_IS_SAME_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_scalar.h b/libcxx/include/__cxx03/__type_traits/is_scalar.h index 455200de47208..49d793ad06e71 100644 --- a/libcxx/include/__cxx03/__type_traits/is_scalar.h +++ b/libcxx/include/__cxx03/__type_traits/is_scalar.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_SCALAR_H #define _LIBCPP___TYPE_TRAITS_IS_SCALAR_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include 
<__type_traits/is_arithmetic.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/is_member_pointer.h> -#include <__type_traits/is_null_pointer.h> -#include <__type_traits/is_pointer.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_member_pointer.h> +#include <__cxx03/__type_traits/is_null_pointer.h> +#include <__cxx03/__type_traits/is_pointer.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_signed.h b/libcxx/include/__cxx03/__type_traits/is_signed.h index fd6f93e182362..6f1cc05aaf373 100644 --- a/libcxx/include/__cxx03/__type_traits/is_signed.h +++ b/libcxx/include/__cxx03/__type_traits/is_signed.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_SIGNED_H #define _LIBCPP___TYPE_TRAITS_IS_SIGNED_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_signed_integer.h b/libcxx/include/__cxx03/__type_traits/is_signed_integer.h index 467548d0aaafb..24b1aae258352 100644 --- a/libcxx/include/__cxx03/__type_traits/is_signed_integer.h +++ b/libcxx/include/__cxx03/__type_traits/is_signed_integer.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H #define _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma 
GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_specialization.h b/libcxx/include/__cxx03/__type_traits/is_specialization.h index 9b75636b1a511..bccdd77df0e56 100644 --- a/libcxx/include/__cxx03/__type_traits/is_specialization.h +++ b/libcxx/include/__cxx03/__type_traits/is_specialization.h @@ -22,7 +22,7 @@ // // Note a cvref qualified _Tp is never considered a specialization. -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_standard_layout.h b/libcxx/include/__cxx03/__type_traits/is_standard_layout.h index 76484f3e2a301..1325f72478bdf 100644 --- a/libcxx/include/__cxx03/__type_traits/is_standard_layout.h +++ b/libcxx/include/__cxx03/__type_traits/is_standard_layout.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_STANDARD_LAYOUT_H #define _LIBCPP___TYPE_TRAITS_IS_STANDARD_LAYOUT_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_swappable.h b/libcxx/include/__cxx03/__type_traits/is_swappable.h index 0b817e6509933..9c2e373675d8c 100644 --- a/libcxx/include/__cxx03/__type_traits/is_swappable.h +++ b/libcxx/include/__cxx03/__type_traits/is_swappable.h @@ -9,16 +9,16 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_SWAPPABLE_H #define _LIBCPP___TYPE_TRAITS_IS_SWAPPABLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include +#include <__cxx03/__config> +#include 
<__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivial.h b/libcxx/include/__cxx03/__type_traits/is_trivial.h index 0007c7446d5e5..acb7a731ead52 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivial.h +++ b/libcxx/include/__cxx03/__type_traits/is_trivial.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIAL_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIAL_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_assignable.h b/libcxx/include/__cxx03/__type_traits/is_trivially_assignable.h index 201333b0fa0b3..757652572c81e 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_assignable.h +++ b/libcxx/include/__cxx03/__type_traits/is_trivially_assignable.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H -#include <__config> -#include <__type_traits/add_const.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_const.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> 
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_constructible.h b/libcxx/include/__cxx03/__type_traits/is_trivially_constructible.h index 3a77e9fe164da..33110203fb422 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_constructible.h +++ b/libcxx/include/__cxx03/__type_traits/is_trivially_constructible.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H -#include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_copyable.h b/libcxx/include/__cxx03/__type_traits/is_trivially_copyable.h index e92af126ee94d..c80688b38c43d 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_copyable.h +++ b/libcxx/include/__cxx03/__type_traits/is_trivially_copyable.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_destructible.h b/libcxx/include/__cxx03/__type_traits/is_trivially_destructible.h index 5f9652f2a5011..7c45619ea22c3 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_destructible.h +++ 
b/libcxx/include/__cxx03/__type_traits/is_trivially_destructible.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DESTRUCTIBLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DESTRUCTIBLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_destructible.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_destructible.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_lexicographically_comparable.h b/libcxx/include/__cxx03/__type_traits/is_trivially_lexicographically_comparable.h index a310ea1b87e30..45043370a19e1 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_lexicographically_comparable.h +++ b/libcxx/include/__cxx03/__type_traits/is_trivially_lexicographically_comparable.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_LEXICOGRAPHICALLY_COMPARABLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_LEXICOGRAPHICALLY_COMPARABLE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_trivially_relocatable.h b/libcxx/include/__cxx03/__type_traits/is_trivially_relocatable.h index c0871731cc001..d134218cba3ff 100644 --- a/libcxx/include/__cxx03/__type_traits/is_trivially_relocatable.h +++ 
b/libcxx/include/__cxx03/__type_traits/is_trivially_relocatable.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_RELOCATABLE_H #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_RELOCATABLE_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_unbounded_array.h b/libcxx/include/__cxx03/__type_traits/is_unbounded_array.h index d58bb09e10428..e7262f5630287 100644 --- a/libcxx/include/__cxx03/__type_traits/is_unbounded_array.h +++ b/libcxx/include/__cxx03/__type_traits/is_unbounded_array.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_UNBOUNDED_ARRAY_H #define _LIBCPP___TYPE_TRAITS_IS_UNBOUNDED_ARRAY_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_union.h b/libcxx/include/__cxx03/__type_traits/is_union.h index 1f009d993545b..03733e6dac236 100644 --- a/libcxx/include/__cxx03/__type_traits/is_union.h +++ b/libcxx/include/__cxx03/__type_traits/is_union.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_UNION_H #define _LIBCPP___TYPE_TRAITS_IS_UNION_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_unsigned.h 
b/libcxx/include/__cxx03/__type_traits/is_unsigned.h index 48c5751ed70d8..b302380cd7838 100644 --- a/libcxx/include/__cxx03/__type_traits/is_unsigned.h +++ b/libcxx/include/__cxx03/__type_traits/is_unsigned.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_H #define _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_integral.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_integral.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_unsigned_integer.h b/libcxx/include/__cxx03/__type_traits/is_unsigned_integer.h index 265894b32d4fc..7a0508289ed64 100644 --- a/libcxx/include/__cxx03/__type_traits/is_unsigned_integer.h +++ b/libcxx/include/__cxx03/__type_traits/is_unsigned_integer.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H #define _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_valid_expansion.h b/libcxx/include/__cxx03/__type_traits/is_valid_expansion.h index 346bc98b110ff..a2a4b30ff2f4f 100644 --- a/libcxx/include/__cxx03/__type_traits/is_valid_expansion.h +++ b/libcxx/include/__cxx03/__type_traits/is_valid_expansion.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_VALID_EXPANSION_H #define _LIBCPP___TYPE_TRAITS_IS_VALID_EXPANSION_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # 
pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_void.h b/libcxx/include/__cxx03/__type_traits/is_void.h index 562faae9fba2c..aa6712e19c33f 100644 --- a/libcxx/include/__cxx03/__type_traits/is_void.h +++ b/libcxx/include/__cxx03/__type_traits/is_void.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_VOID_H #define _LIBCPP___TYPE_TRAITS_IS_VOID_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/is_volatile.h b/libcxx/include/__cxx03/__type_traits/is_volatile.h index 87960a819c8fc..43f9d089d2600 100644 --- a/libcxx/include/__cxx03/__type_traits/is_volatile.h +++ b/libcxx/include/__cxx03/__type_traits/is_volatile.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_IS_VOLATILE_H #define _LIBCPP___TYPE_TRAITS_IS_VOLATILE_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/lazy.h b/libcxx/include/__cxx03/__type_traits/lazy.h index 80826f1d64f60..4eb5893671f13 100644 --- a/libcxx/include/__cxx03/__type_traits/lazy.h +++ b/libcxx/include/__cxx03/__type_traits/lazy.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_LAZY_H #define _LIBCPP___TYPE_TRAITS_LAZY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/make_32_64_or_128_bit.h b/libcxx/include/__cxx03/__type_traits/make_32_64_or_128_bit.h index f7f2e81735daf..e1be7d2022edb 100644 --- a/libcxx/include/__cxx03/__type_traits/make_32_64_or_128_bit.h +++ b/libcxx/include/__cxx03/__type_traits/make_32_64_or_128_bit.h @@ -9,13 +9,13 
@@ #ifndef _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H #define _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/make_unsigned.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/make_const_lvalue_ref.h b/libcxx/include/__cxx03/__type_traits/make_const_lvalue_ref.h index 469d4cb31ef7d..cf9a46262a4b0 100644 --- a/libcxx/include/__cxx03/__type_traits/make_const_lvalue_ref.h +++ b/libcxx/include/__cxx03/__type_traits/make_const_lvalue_ref.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_MAKE_CONST_LVALUE_REF_H #define _LIBCPP___TYPE_TRAITS_MAKE_CONST_LVALUE_REF_H -#include <__config> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/make_signed.h b/libcxx/include/__cxx03/__type_traits/make_signed.h index d09d6ed4a1e7c..6fc97078d6536 100644 --- a/libcxx/include/__cxx03/__type_traits/make_signed.h +++ b/libcxx/include/__cxx03/__type_traits/make_signed.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_MAKE_SIGNED_H #define _LIBCPP___TYPE_TRAITS_MAKE_SIGNED_H -#include <__config> -#include <__type_traits/copy_cv.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/nat.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/type_list.h> +#include 
<__cxx03/__config> +#include <__cxx03/__type_traits/copy_cv.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/type_list.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/make_unsigned.h b/libcxx/include/__cxx03/__type_traits/make_unsigned.h index 282cd2d911316..1b0312d61f9b4 100644 --- a/libcxx/include/__cxx03/__type_traits/make_unsigned.h +++ b/libcxx/include/__cxx03/__type_traits/make_unsigned.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___TYPE_TRAITS_MAKE_UNSIGNED_H #define _LIBCPP___TYPE_TRAITS_MAKE_UNSIGNED_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/copy_cv.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/nat.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/type_list.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/copy_cv.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/type_list.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/maybe_const.h b/libcxx/include/__cxx03/__type_traits/maybe_const.h index 25fba58fb7730..97928110edf24 100644 --- a/libcxx/include/__cxx03/__type_traits/maybe_const.h +++ b/libcxx/include/__cxx03/__type_traits/maybe_const.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_MAYBE_CONST_H #define _LIBCPP___TYPE_TRAITS_MAYBE_CONST_H -#include <__config> -#include <__type_traits/conditional.h> +#include 
<__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/nat.h b/libcxx/include/__cxx03/__type_traits/nat.h index 9f39b806814e2..e476364c4950e 100644 --- a/libcxx/include/__cxx03/__type_traits/nat.h +++ b/libcxx/include/__cxx03/__type_traits/nat.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_NAT_H #define _LIBCPP___TYPE_TRAITS_NAT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/negation.h b/libcxx/include/__cxx03/__type_traits/negation.h index a72e62d3f96e0..56559071e62f7 100644 --- a/libcxx/include/__cxx03/__type_traits/negation.h +++ b/libcxx/include/__cxx03/__type_traits/negation.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_NEGATION_H #define _LIBCPP___TYPE_TRAITS_NEGATION_H -#include <__config> -#include <__type_traits/integral_constant.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/noexcept_move_assign_container.h b/libcxx/include/__cxx03/__type_traits/noexcept_move_assign_container.h index baaf36d9980e9..4684440a5760f 100644 --- a/libcxx/include/__cxx03/__type_traits/noexcept_move_assign_container.h +++ b/libcxx/include/__cxx03/__type_traits/noexcept_move_assign_container.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H #define _LIBCPP___TYPE_TRAITS_NOEXCEPT_MOVE_ASSIGN_CONTAINER_H -#include <__config> -#include <__memory/allocator_traits.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include 
<__cxx03/__type_traits/is_nothrow_assignable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/promote.h b/libcxx/include/__cxx03/__type_traits/promote.h index 2b2a6843b9150..b890aecdb5eb8 100644 --- a/libcxx/include/__cxx03/__type_traits/promote.h +++ b/libcxx/include/__cxx03/__type_traits/promote.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___TYPE_TRAITS_PROMOTE_H #define _LIBCPP___TYPE_TRAITS_PROMOTE_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_arithmetic.h> #if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER == 1700 -# include <__type_traits/is_same.h> -# include <__utility/declval.h> +# include <__cxx03/__type_traits/is_same.h> +# include <__cxx03/__utility/declval.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/__type_traits/rank.h b/libcxx/include/__cxx03/__type_traits/rank.h index 7f6fad1c54024..6f2109b6541f7 100644 --- a/libcxx/include/__cxx03/__type_traits/rank.h +++ b/libcxx/include/__cxx03/__type_traits/rank.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_RANK_H #define _LIBCPP___TYPE_TRAITS_RANK_H -#include <__config> -#include <__type_traits/integral_constant.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_all_extents.h b/libcxx/include/__cxx03/__type_traits/remove_all_extents.h index d5373b51f5221..3d06a52117889 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_all_extents.h +++ b/libcxx/include/__cxx03/__type_traits/remove_all_extents.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_ALL_EXTENTS_H #define 
_LIBCPP___TYPE_TRAITS_REMOVE_ALL_EXTENTS_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_const.h b/libcxx/include/__cxx03/__type_traits/remove_const.h index a3f0648c47850..06a70a260080c 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_const.h +++ b/libcxx/include/__cxx03/__type_traits/remove_const.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_CONST_H #define _LIBCPP___TYPE_TRAITS_REMOVE_CONST_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_const_ref.h b/libcxx/include/__cxx03/__type_traits/remove_const_ref.h index d3b334935a5b1..c7d058dc2a65e 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_const_ref.h +++ b/libcxx/include/__cxx03/__type_traits/remove_const_ref.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_CONST_REF_H #define _LIBCPP___TYPE_TRAITS_REMOVE_CONST_REF_H -#include <__config> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_cv.h b/libcxx/include/__cxx03/__type_traits/remove_cv.h index c4bf612794bd5..34bce7cf45ff1 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_cv.h +++ b/libcxx/include/__cxx03/__type_traits/remove_cv.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_CV_H #define _LIBCPP___TYPE_TRAITS_REMOVE_CV_H -#include <__config> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_volatile.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_const.h> +#include 
<__cxx03/__type_traits/remove_volatile.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_cvref.h b/libcxx/include/__cxx03/__type_traits/remove_cvref.h index e8e8745ab0960..38a37d730e44a 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_cvref.h +++ b/libcxx/include/__cxx03/__type_traits/remove_cvref.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_CVREF_H #define _LIBCPP___TYPE_TRAITS_REMOVE_CVREF_H -#include <__config> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_extent.h b/libcxx/include/__cxx03/__type_traits/remove_extent.h index fe37b5c7266c6..b1ccdc8a075d5 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_extent.h +++ b/libcxx/include/__cxx03/__type_traits/remove_extent.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_EXTENT_H #define _LIBCPP___TYPE_TRAITS_REMOVE_EXTENT_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_pointer.h b/libcxx/include/__cxx03/__type_traits/remove_pointer.h index 1048f67055a28..965ddf01a30f7 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_pointer.h +++ b/libcxx/include/__cxx03/__type_traits/remove_pointer.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_POINTER_H #define _LIBCPP___TYPE_TRAITS_REMOVE_POINTER_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/__type_traits/remove_reference.h b/libcxx/include/__cxx03/__type_traits/remove_reference.h index ba67891758adc..768b9acd2b687 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_reference.h +++ b/libcxx/include/__cxx03/__type_traits/remove_reference.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_REFERENCE_H #define _LIBCPP___TYPE_TRAITS_REMOVE_REFERENCE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/remove_volatile.h b/libcxx/include/__cxx03/__type_traits/remove_volatile.h index 7600ae0ec5167..95fc78ee8e41c 100644 --- a/libcxx/include/__cxx03/__type_traits/remove_volatile.h +++ b/libcxx/include/__cxx03/__type_traits/remove_volatile.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_REMOVE_VOLATILE_H #define _LIBCPP___TYPE_TRAITS_REMOVE_VOLATILE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/result_of.h b/libcxx/include/__cxx03/__type_traits/result_of.h index f00fa8e9be7f7..7398f52f8965d 100644 --- a/libcxx/include/__cxx03/__type_traits/result_of.h +++ b/libcxx/include/__cxx03/__type_traits/result_of.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_RESULT_OF_H #define _LIBCPP___TYPE_TRAITS_RESULT_OF_H -#include <__config> -#include <__functional/invoke.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/strip_signature.h b/libcxx/include/__cxx03/__type_traits/strip_signature.h index 3fe79592f55b8..d728604ac2fec 100644 --- a/libcxx/include/__cxx03/__type_traits/strip_signature.h +++ b/libcxx/include/__cxx03/__type_traits/strip_signature.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_STRIP_SIGNATURE_H #define 
_LIBCPP___TYPE_TRAITS_STRIP_SIGNATURE_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/type_identity.h b/libcxx/include/__cxx03/__type_traits/type_identity.h index b0b5a1277d596..27bbd7c51d426 100644 --- a/libcxx/include/__cxx03/__type_traits/type_identity.h +++ b/libcxx/include/__cxx03/__type_traits/type_identity.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_TYPE_IDENTITY_H #define _LIBCPP___TYPE_TRAITS_TYPE_IDENTITY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/type_list.h b/libcxx/include/__cxx03/__type_traits/type_list.h index 02905707ee37a..31717a60b13cf 100644 --- a/libcxx/include/__cxx03/__type_traits/type_list.h +++ b/libcxx/include/__cxx03/__type_traits/type_list.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_TYPE_LIST_H #define _LIBCPP___TYPE_TRAITS_TYPE_LIST_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/underlying_type.h b/libcxx/include/__cxx03/__type_traits/underlying_type.h index 16e7501dee17d..da9d3df264526 100644 --- a/libcxx/include/__cxx03/__type_traits/underlying_type.h +++ b/libcxx/include/__cxx03/__type_traits/underlying_type.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___TYPE_TRAITS_UNDERLYING_TYPE_H #define _LIBCPP___TYPE_TRAITS_UNDERLYING_TYPE_H -#include <__config> -#include <__type_traits/is_enum.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_enum.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/unwrap_ref.h b/libcxx/include/__cxx03/__type_traits/unwrap_ref.h index 6bd74550f3092..034bf337518de 100644 --- 
a/libcxx/include/__cxx03/__type_traits/unwrap_ref.h +++ b/libcxx/include/__cxx03/__type_traits/unwrap_ref.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___TYPE_TRAITS_UNWRAP_REF_H #define _LIBCPP___TYPE_TRAITS_UNWRAP_REF_H -#include <__config> -#include <__fwd/functional.h> -#include <__type_traits/decay.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/decay.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__type_traits/void_t.h b/libcxx/include/__cxx03/__type_traits/void_t.h index 985bba02e72f7..2fa557ab38178 100644 --- a/libcxx/include/__cxx03/__type_traits/void_t.h +++ b/libcxx/include/__cxx03/__type_traits/void_t.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___TYPE_TRAITS_VOID_T_H #define _LIBCPP___TYPE_TRAITS_VOID_T_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/as_const.h b/libcxx/include/__cxx03/__utility/as_const.h index 582dd42f40791..33c4686d663e6 100644 --- a/libcxx/include/__cxx03/__utility/as_const.h +++ b/libcxx/include/__cxx03/__utility/as_const.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___UTILITY_AS_CONST_H #define _LIBCPP___UTILITY_AS_CONST_H -#include <__config> -#include <__type_traits/add_const.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/add_const.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/as_lvalue.h b/libcxx/include/__cxx03/__utility/as_lvalue.h index 159f45dad4d41..95ce9497207d2 100644 --- a/libcxx/include/__cxx03/__utility/as_lvalue.h +++ b/libcxx/include/__cxx03/__utility/as_lvalue.h @@ -10,14 +10,14 @@ #ifndef _LIBCPP___UTILITY_AS_LVALUE_H #define _LIBCPP___UTILITY_AS_LVALUE_H -#include 
<__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/auto_cast.h b/libcxx/include/__cxx03/__utility/auto_cast.h index 06715b3438f99..9932382c63063 100644 --- a/libcxx/include/__cxx03/__utility/auto_cast.h +++ b/libcxx/include/__cxx03/__utility/auto_cast.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___UTILITY_AUTO_CAST_H #define _LIBCPP___UTILITY_AUTO_CAST_H -#include <__config> -#include <__type_traits/decay.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/decay.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/cmp.h b/libcxx/include/__cxx03/__utility/cmp.h index b7c1ed614dfcb..512a31857642a 100644 --- a/libcxx/include/__cxx03/__utility/cmp.h +++ b/libcxx/include/__cxx03/__utility/cmp.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___UTILITY_CMP_H #define _LIBCPP___UTILITY_CMP_H -#include <__concepts/arithmetic.h> -#include <__config> -#include <__type_traits/is_signed.h> -#include <__type_traits/make_unsigned.h> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/limits> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/convert_to_integral.h b/libcxx/include/__cxx03/__utility/convert_to_integral.h index f1fcdd98010cc..b350d0fd9d934 100644 --- a/libcxx/include/__cxx03/__utility/convert_to_integral.h +++ b/libcxx/include/__cxx03/__utility/convert_to_integral.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___UTILITY_CONVERT_TO_INTEGRAL_H #define 
_LIBCPP___UTILITY_CONVERT_TO_INTEGRAL_H -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_enum.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/underlying_type.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/underlying_type.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/declval.h b/libcxx/include/__cxx03/__utility/declval.h index d0856b8afa4db..f8c2c2ac4287a 100644 --- a/libcxx/include/__cxx03/__utility/declval.h +++ b/libcxx/include/__cxx03/__utility/declval.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___UTILITY_DECLVAL_H #define _LIBCPP___UTILITY_DECLVAL_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/empty.h b/libcxx/include/__cxx03/__utility/empty.h index 8cca197145c72..bd169acabc489 100644 --- a/libcxx/include/__cxx03/__utility/empty.h +++ b/libcxx/include/__cxx03/__utility/empty.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___UTILITY_EMPTY_H #define _LIBCPP___UTILITY_EMPTY_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/exception_guard.h b/libcxx/include/__cxx03/__utility/exception_guard.h index 9f732ca265c86..ec4c58fca0ff6 100644 --- a/libcxx/include/__cxx03/__utility/exception_guard.h +++ b/libcxx/include/__cxx03/__utility/exception_guard.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___UTILITY_TRANSACTION_H #define _LIBCPP___UTILITY_TRANSACTION_H -#include <__assert> -#include <__config> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/exchange.h> -#include <__utility/move.h> +#include <__cxx03/__assert> +#include 
<__cxx03/__config> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/exchange.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/exchange.h b/libcxx/include/__cxx03/__utility/exchange.h index 957e9d0acaa65..258d4c4ce4e9e 100644 --- a/libcxx/include/__cxx03/__utility/exchange.h +++ b/libcxx/include/__cxx03/__utility/exchange.h @@ -9,18 +9,18 @@ #ifndef _LIBCPP___UTILITY_EXCHANGE_H #define _LIBCPP___UTILITY_EXCHANGE_H -#include <__config> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__utility/forward.h> -#include <__utility/move.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/forward.h b/libcxx/include/__cxx03/__utility/forward.h index d5275dcbd0edc..ac63e2cafd3d8 100644 --- a/libcxx/include/__cxx03/__utility/forward.h +++ b/libcxx/include/__cxx03/__utility/forward.h @@ -10,9 +10,9 @@ #ifndef _LIBCPP___UTILITY_FORWARD_H #define _LIBCPP___UTILITY_FORWARD_H -#include <__config> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/forward_like.h 
b/libcxx/include/__cxx03/__utility/forward_like.h index 0206ce23a5668..ed5cbee0be0c4 100644 --- a/libcxx/include/__cxx03/__utility/forward_like.h +++ b/libcxx/include/__cxx03/__utility/forward_like.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP___UTILITY_FORWARD_LIKE_H #define _LIBCPP___UTILITY_FORWARD_LIKE_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/in_place.h b/libcxx/include/__cxx03/__utility/in_place.h index fa7a2f4bfd4a9..b0c257dec4652 100644 --- a/libcxx/include/__cxx03/__utility/in_place.h +++ b/libcxx/include/__cxx03/__utility/in_place.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___UTILITY_IN_PLACE_H #define _LIBCPP___UTILITY_IN_PLACE_H -#include <__config> -#include <__type_traits/remove_cvref.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/integer_sequence.h b/libcxx/include/__cxx03/__utility/integer_sequence.h index ccce9433e7a80..5f3d1a7630018 100644 --- a/libcxx/include/__cxx03/__utility/integer_sequence.h +++ b/libcxx/include/__cxx03/__utility/integer_sequence.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___UTILITY_INTEGER_SEQUENCE_H #define _LIBCPP___UTILITY_INTEGER_SEQUENCE_H -#include <__config> -#include <__type_traits/is_integral.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header diff --git a/libcxx/include/__cxx03/__utility/is_pointer_in_range.h b/libcxx/include/__cxx03/__utility/is_pointer_in_range.h index 4130b4ac70700..8773c526a0a3f 100644 --- a/libcxx/include/__cxx03/__utility/is_pointer_in_range.h +++ b/libcxx/include/__cxx03/__utility/is_pointer_in_range.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___UTILITY_IS_POINTER_IN_RANGE_H #define _LIBCPP___UTILITY_IS_POINTER_IN_RANGE_H -#include <__algorithm/comp.h> -#include <__assert> -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/is_valid_range.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/is_valid_range.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/is_valid_range.h b/libcxx/include/__cxx03/__utility/is_valid_range.h index 7286662dbf309..a7027bd582cb7 100644 --- a/libcxx/include/__cxx03/__utility/is_valid_range.h +++ b/libcxx/include/__cxx03/__utility/is_valid_range.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___UTILITY_IS_VALID_RANGE_H #define _LIBCPP___UTILITY_IS_VALID_RANGE_H -#include <__algorithm/comp.h> -#include <__config> -#include <__type_traits/is_constant_evaluated.h> +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_constant_evaluated.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/move.h b/libcxx/include/__cxx03/__utility/move.h index b6a42db0545e2..89fa877161e65 
100644 --- a/libcxx/include/__cxx03/__utility/move.h +++ b/libcxx/include/__cxx03/__utility/move.h @@ -10,18 +10,18 @@ #ifndef _LIBCPP___UTILITY_MOVE_H #define _LIBCPP___UTILITY_MOVE_H -#include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/remove_reference.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/remove_reference.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/no_destroy.h b/libcxx/include/__cxx03/__utility/no_destroy.h index 8edd194577d7c..14ac437ecf713 100644 --- a/libcxx/include/__cxx03/__utility/no_destroy.h +++ b/libcxx/include/__cxx03/__utility/no_destroy.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___UTILITY_NO_DESTROY_H #define _LIBCPP___UTILITY_NO_DESTROY_H -#include <__config> -#include <__type_traits/is_constant_evaluated.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/pair.h b/libcxx/include/__cxx03/__utility/pair.h index c0002b7abb3ca..175976a8f627b 100644 --- a/libcxx/include/__cxx03/__utility/pair.h +++ b/libcxx/include/__cxx03/__utility/pair.h @@ -9,48 +9,48 @@ #ifndef _LIBCPP___UTILITY_PAIR_H #define _LIBCPP___UTILITY_PAIR_H -#include <__compare/common_comparison_category.h> -#include <__compare/synth_three_way.h> -#include <__concepts/different_from.h> -#include <__config> -#include 
<__fwd/array.h> -#include <__fwd/pair.h> -#include <__fwd/tuple.h> -#include <__tuple/sfinae_helpers.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_indices.h> -#include <__tuple/tuple_like_no_subrange.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/common_reference.h> -#include <__type_traits/common_type.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_implicitly_default_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/nat.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/unwrap_ref.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/piecewise_construct.h> -#include +#include <__cxx03/__compare/common_comparison_category.h> +#include <__cxx03/__compare/synth_three_way.h> +#include <__cxx03/__concepts/different_from.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/sfinae_helpers.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__tuple/tuple_like_no_subrange.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_assignable.h> 
+#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_implicitly_default_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/unwrap_ref.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/piecewise_construct.h b/libcxx/include/__cxx03/__utility/piecewise_construct.h index 52b19d791e100..10ad333f81f14 100644 --- a/libcxx/include/__cxx03/__utility/piecewise_construct.h +++ b/libcxx/include/__cxx03/__utility/piecewise_construct.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___UTILITY_PIECEWISE_CONSTRUCT_H #define _LIBCPP___UTILITY_PIECEWISE_CONSTRUCT_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/priority_tag.h b/libcxx/include/__cxx03/__utility/priority_tag.h index a159ce7f1afb3..8d8cf0e70056f 100644 --- a/libcxx/include/__cxx03/__utility/priority_tag.h +++ b/libcxx/include/__cxx03/__utility/priority_tag.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___UTILITY_PRIORITY_TAG_H #define _LIBCPP___UTILITY_PRIORITY_TAG_H -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> 
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/private_constructor_tag.h b/libcxx/include/__cxx03/__utility/private_constructor_tag.h index 462cab48c9edd..c2198dbdd420f 100644 --- a/libcxx/include/__cxx03/__utility/private_constructor_tag.h +++ b/libcxx/include/__cxx03/__utility/private_constructor_tag.h @@ -10,7 +10,7 @@ #ifndef _LIBCPP__UTILITY_PRIVATE_CONSTRUCTOR_TAG_H #define _LIBCPP__UTILITY_PRIVATE_CONSTRUCTOR_TAG_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/rel_ops.h b/libcxx/include/__cxx03/__utility/rel_ops.h index a8caf5bdeaf27..f066a816fcfb9 100644 --- a/libcxx/include/__cxx03/__utility/rel_ops.h +++ b/libcxx/include/__cxx03/__utility/rel_ops.h @@ -9,7 +9,7 @@ #ifndef _LIBCPP___UTILITY_REL_OPS_H #define _LIBCPP___UTILITY_REL_OPS_H -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/small_buffer.h b/libcxx/include/__cxx03/__utility/small_buffer.h index 9e13797573d2d..4836a739ed085 100644 --- a/libcxx/include/__cxx03/__utility/small_buffer.h +++ b/libcxx/include/__cxx03/__utility/small_buffer.h @@ -9,15 +9,15 @@ #ifndef _LIBCPP___UTILITY_SMALL_BUFFER_H #define _LIBCPP___UTILITY_SMALL_BUFFER_H -#include <__config> -#include <__memory/construct_at.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__utility/exception_guard.h> -#include <__utility/forward.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include 
<__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/new> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/swap.h b/libcxx/include/__cxx03/__utility/swap.h index ab88b8e0a0b53..6ad367509635c 100644 --- a/libcxx/include/__cxx03/__utility/swap.h +++ b/libcxx/include/__cxx03/__utility/swap.h @@ -9,22 +9,22 @@ #ifndef _LIBCPP___UTILITY_SWAP_H #define _LIBCPP___UTILITY_SWAP_H -#include <__config> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_swappable.h> -#include <__utility/declval.h> -#include <__utility/move.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/__utility/to_underlying.h b/libcxx/include/__cxx03/__utility/to_underlying.h index 77587108f20dc..8d33de1714c72 100644 --- a/libcxx/include/__cxx03/__utility/to_underlying.h +++ b/libcxx/include/__cxx03/__utility/to_underlying.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP___UTILITY_TO_UNDERLYING_H #define _LIBCPP___UTILITY_TO_UNDERLYING_H -#include <__config> -#include <__type_traits/underlying_type.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/underlying_type.h> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__utility/unreachable.h b/libcxx/include/__cxx03/__utility/unreachable.h index d833f74c2e4f1..fb4a775d02440 100644 --- a/libcxx/include/__cxx03/__utility/unreachable.h +++ b/libcxx/include/__cxx03/__utility/unreachable.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___UTILITY_UNREACHABLE_H #define _LIBCPP___UTILITY_UNREACHABLE_H -#include <__assert> -#include <__config> +#include <__cxx03/__assert> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__variant/monostate.h b/libcxx/include/__cxx03/__variant/monostate.h index 16f156609eb7d..6a136e75516b6 100644 --- a/libcxx/include/__cxx03/__variant/monostate.h +++ b/libcxx/include/__cxx03/__variant/monostate.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP___VARIANT_MONOSTATE_H #define _LIBCPP___VARIANT_MONOSTATE_H -#include <__compare/ordering.h> -#include <__config> -#include <__functional/hash.h> -#include +#include <__cxx03/__compare/ordering.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/__verbose_abort b/libcxx/include/__cxx03/__verbose_abort index 195ce65b721ff..4eb994a4f2043 100644 --- a/libcxx/include/__cxx03/__verbose_abort +++ b/libcxx/include/__cxx03/__verbose_abort @@ -10,7 +10,7 @@ #ifndef _LIBCPP___VERBOSE_ABORT #define _LIBCPP___VERBOSE_ABORT -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/algorithm b/libcxx/include/__cxx03/algorithm index 698e6f5cb7ad1..929e2c5b7422a 100644 --- a/libcxx/include/__cxx03/algorithm +++ b/libcxx/include/__cxx03/algorithm @@ -13,7 +13,7 @@ /* algorithm synopsis -#include +#include <__cxx03/initializer_list> namespace std { @@ 
-1818,232 +1818,232 @@ template */ -#include <__config> - -#include <__algorithm/adjacent_find.h> -#include <__algorithm/all_of.h> -#include <__algorithm/any_of.h> -#include <__algorithm/binary_search.h> -#include <__algorithm/copy.h> -#include <__algorithm/copy_backward.h> -#include <__algorithm/copy_if.h> -#include <__algorithm/copy_n.h> -#include <__algorithm/count.h> -#include <__algorithm/count_if.h> -#include <__algorithm/equal.h> -#include <__algorithm/equal_range.h> -#include <__algorithm/fill.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/find.h> -#include <__algorithm/find_end.h> -#include <__algorithm/find_first_of.h> -#include <__algorithm/find_if.h> -#include <__algorithm/find_if_not.h> -#include <__algorithm/for_each.h> -#include <__algorithm/generate.h> -#include <__algorithm/generate_n.h> -#include <__algorithm/includes.h> -#include <__algorithm/inplace_merge.h> -#include <__algorithm/is_heap.h> -#include <__algorithm/is_heap_until.h> -#include <__algorithm/is_partitioned.h> -#include <__algorithm/is_permutation.h> -#include <__algorithm/is_sorted.h> -#include <__algorithm/is_sorted_until.h> -#include <__algorithm/iter_swap.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lower_bound.h> -#include <__algorithm/make_heap.h> -#include <__algorithm/max.h> -#include <__algorithm/max_element.h> -#include <__algorithm/merge.h> -#include <__algorithm/min.h> -#include <__algorithm/min_element.h> -#include <__algorithm/minmax.h> -#include <__algorithm/minmax_element.h> -#include <__algorithm/mismatch.h> -#include <__algorithm/move.h> -#include <__algorithm/move_backward.h> -#include <__algorithm/next_permutation.h> -#include <__algorithm/none_of.h> -#include <__algorithm/nth_element.h> -#include <__algorithm/partial_sort.h> -#include <__algorithm/partial_sort_copy.h> -#include <__algorithm/partition.h> -#include <__algorithm/partition_copy.h> -#include <__algorithm/partition_point.h> -#include <__algorithm/pop_heap.h> 
-#include <__algorithm/prev_permutation.h> -#include <__algorithm/push_heap.h> -#include <__algorithm/remove.h> -#include <__algorithm/remove_copy.h> -#include <__algorithm/remove_copy_if.h> -#include <__algorithm/remove_if.h> -#include <__algorithm/replace.h> -#include <__algorithm/replace_copy.h> -#include <__algorithm/replace_copy_if.h> -#include <__algorithm/replace_if.h> -#include <__algorithm/reverse.h> -#include <__algorithm/reverse_copy.h> -#include <__algorithm/rotate.h> -#include <__algorithm/rotate_copy.h> -#include <__algorithm/search.h> -#include <__algorithm/search_n.h> -#include <__algorithm/set_difference.h> -#include <__algorithm/set_intersection.h> -#include <__algorithm/set_symmetric_difference.h> -#include <__algorithm/set_union.h> -#include <__algorithm/shuffle.h> -#include <__algorithm/sort.h> -#include <__algorithm/sort_heap.h> -#include <__algorithm/stable_partition.h> -#include <__algorithm/stable_sort.h> -#include <__algorithm/swap_ranges.h> -#include <__algorithm/transform.h> -#include <__algorithm/unique.h> -#include <__algorithm/unique_copy.h> -#include <__algorithm/upper_bound.h> +#include <__cxx03/__config> + +#include <__cxx03/__algorithm/adjacent_find.h> +#include <__cxx03/__algorithm/all_of.h> +#include <__cxx03/__algorithm/any_of.h> +#include <__cxx03/__algorithm/binary_search.h> +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/copy_backward.h> +#include <__cxx03/__algorithm/copy_if.h> +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/count.h> +#include <__cxx03/__algorithm/count_if.h> +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/equal_range.h> +#include <__cxx03/__algorithm/fill.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/find_end.h> +#include <__cxx03/__algorithm/find_first_of.h> +#include <__cxx03/__algorithm/find_if.h> +#include <__cxx03/__algorithm/find_if_not.h> +#include 
<__cxx03/__algorithm/for_each.h> +#include <__cxx03/__algorithm/generate.h> +#include <__cxx03/__algorithm/generate_n.h> +#include <__cxx03/__algorithm/includes.h> +#include <__cxx03/__algorithm/inplace_merge.h> +#include <__cxx03/__algorithm/is_heap.h> +#include <__cxx03/__algorithm/is_heap_until.h> +#include <__cxx03/__algorithm/is_partitioned.h> +#include <__cxx03/__algorithm/is_permutation.h> +#include <__cxx03/__algorithm/is_sorted.h> +#include <__cxx03/__algorithm/is_sorted_until.h> +#include <__cxx03/__algorithm/iter_swap.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lower_bound.h> +#include <__cxx03/__algorithm/make_heap.h> +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/max_element.h> +#include <__cxx03/__algorithm/merge.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/min_element.h> +#include <__cxx03/__algorithm/minmax.h> +#include <__cxx03/__algorithm/minmax_element.h> +#include <__cxx03/__algorithm/mismatch.h> +#include <__cxx03/__algorithm/move.h> +#include <__cxx03/__algorithm/move_backward.h> +#include <__cxx03/__algorithm/next_permutation.h> +#include <__cxx03/__algorithm/none_of.h> +#include <__cxx03/__algorithm/nth_element.h> +#include <__cxx03/__algorithm/partial_sort.h> +#include <__cxx03/__algorithm/partial_sort_copy.h> +#include <__cxx03/__algorithm/partition.h> +#include <__cxx03/__algorithm/partition_copy.h> +#include <__cxx03/__algorithm/partition_point.h> +#include <__cxx03/__algorithm/pop_heap.h> +#include <__cxx03/__algorithm/prev_permutation.h> +#include <__cxx03/__algorithm/push_heap.h> +#include <__cxx03/__algorithm/remove.h> +#include <__cxx03/__algorithm/remove_copy.h> +#include <__cxx03/__algorithm/remove_copy_if.h> +#include <__cxx03/__algorithm/remove_if.h> +#include <__cxx03/__algorithm/replace.h> +#include <__cxx03/__algorithm/replace_copy.h> +#include <__cxx03/__algorithm/replace_copy_if.h> +#include <__cxx03/__algorithm/replace_if.h> 
+#include <__cxx03/__algorithm/reverse.h> +#include <__cxx03/__algorithm/reverse_copy.h> +#include <__cxx03/__algorithm/rotate.h> +#include <__cxx03/__algorithm/rotate_copy.h> +#include <__cxx03/__algorithm/search.h> +#include <__cxx03/__algorithm/search_n.h> +#include <__cxx03/__algorithm/set_difference.h> +#include <__cxx03/__algorithm/set_intersection.h> +#include <__cxx03/__algorithm/set_symmetric_difference.h> +#include <__cxx03/__algorithm/set_union.h> +#include <__cxx03/__algorithm/shuffle.h> +#include <__cxx03/__algorithm/sort.h> +#include <__cxx03/__algorithm/sort_heap.h> +#include <__cxx03/__algorithm/stable_partition.h> +#include <__cxx03/__algorithm/stable_sort.h> +#include <__cxx03/__algorithm/swap_ranges.h> +#include <__cxx03/__algorithm/transform.h> +#include <__cxx03/__algorithm/unique.h> +#include <__cxx03/__algorithm/unique_copy.h> +#include <__cxx03/__algorithm/upper_bound.h> #if _LIBCPP_STD_VER >= 17 -# include <__algorithm/clamp.h> -# include <__algorithm/for_each_n.h> -# include <__algorithm/pstl.h> -# include <__algorithm/sample.h> +# include <__cxx03/__algorithm/clamp.h> +# include <__cxx03/__algorithm/for_each_n.h> +# include <__cxx03/__algorithm/pstl.h> +# include <__cxx03/__algorithm/sample.h> #endif // _LIBCPP_STD_VER >= 17 #if _LIBCPP_STD_VER >= 20 -# include <__algorithm/in_found_result.h> -# include <__algorithm/in_fun_result.h> -# include <__algorithm/in_in_out_result.h> -# include <__algorithm/in_in_result.h> -# include <__algorithm/in_out_out_result.h> -# include <__algorithm/in_out_result.h> -# include <__algorithm/lexicographical_compare_three_way.h> -# include <__algorithm/min_max_result.h> -# include <__algorithm/ranges_adjacent_find.h> -# include <__algorithm/ranges_all_of.h> -# include <__algorithm/ranges_any_of.h> -# include <__algorithm/ranges_binary_search.h> -# include <__algorithm/ranges_clamp.h> -# include <__algorithm/ranges_contains.h> -# include <__algorithm/ranges_copy.h> -# include 
<__algorithm/ranges_copy_backward.h> -# include <__algorithm/ranges_copy_if.h> -# include <__algorithm/ranges_copy_n.h> -# include <__algorithm/ranges_count.h> -# include <__algorithm/ranges_count_if.h> -# include <__algorithm/ranges_equal.h> -# include <__algorithm/ranges_equal_range.h> -# include <__algorithm/ranges_fill.h> -# include <__algorithm/ranges_fill_n.h> -# include <__algorithm/ranges_find.h> -# include <__algorithm/ranges_find_end.h> -# include <__algorithm/ranges_find_first_of.h> -# include <__algorithm/ranges_find_if.h> -# include <__algorithm/ranges_find_if_not.h> -# include <__algorithm/ranges_for_each.h> -# include <__algorithm/ranges_for_each_n.h> -# include <__algorithm/ranges_generate.h> -# include <__algorithm/ranges_generate_n.h> -# include <__algorithm/ranges_includes.h> -# include <__algorithm/ranges_inplace_merge.h> -# include <__algorithm/ranges_is_heap.h> -# include <__algorithm/ranges_is_heap_until.h> -# include <__algorithm/ranges_is_partitioned.h> -# include <__algorithm/ranges_is_permutation.h> -# include <__algorithm/ranges_is_sorted.h> -# include <__algorithm/ranges_is_sorted_until.h> -# include <__algorithm/ranges_lexicographical_compare.h> -# include <__algorithm/ranges_lower_bound.h> -# include <__algorithm/ranges_make_heap.h> -# include <__algorithm/ranges_max.h> -# include <__algorithm/ranges_max_element.h> -# include <__algorithm/ranges_merge.h> -# include <__algorithm/ranges_min.h> -# include <__algorithm/ranges_min_element.h> -# include <__algorithm/ranges_minmax.h> -# include <__algorithm/ranges_minmax_element.h> -# include <__algorithm/ranges_mismatch.h> -# include <__algorithm/ranges_move.h> -# include <__algorithm/ranges_move_backward.h> -# include <__algorithm/ranges_next_permutation.h> -# include <__algorithm/ranges_none_of.h> -# include <__algorithm/ranges_nth_element.h> -# include <__algorithm/ranges_partial_sort.h> -# include <__algorithm/ranges_partial_sort_copy.h> -# include <__algorithm/ranges_partition.h> -# 
include <__algorithm/ranges_partition_copy.h> -# include <__algorithm/ranges_partition_point.h> -# include <__algorithm/ranges_pop_heap.h> -# include <__algorithm/ranges_prev_permutation.h> -# include <__algorithm/ranges_push_heap.h> -# include <__algorithm/ranges_remove.h> -# include <__algorithm/ranges_remove_copy.h> -# include <__algorithm/ranges_remove_copy_if.h> -# include <__algorithm/ranges_remove_if.h> -# include <__algorithm/ranges_replace.h> -# include <__algorithm/ranges_replace_copy.h> -# include <__algorithm/ranges_replace_copy_if.h> -# include <__algorithm/ranges_replace_if.h> -# include <__algorithm/ranges_reverse.h> -# include <__algorithm/ranges_reverse_copy.h> -# include <__algorithm/ranges_rotate.h> -# include <__algorithm/ranges_rotate_copy.h> -# include <__algorithm/ranges_sample.h> -# include <__algorithm/ranges_search.h> -# include <__algorithm/ranges_search_n.h> -# include <__algorithm/ranges_set_difference.h> -# include <__algorithm/ranges_set_intersection.h> -# include <__algorithm/ranges_set_symmetric_difference.h> -# include <__algorithm/ranges_set_union.h> -# include <__algorithm/ranges_shuffle.h> -# include <__algorithm/ranges_sort.h> -# include <__algorithm/ranges_sort_heap.h> -# include <__algorithm/ranges_stable_partition.h> -# include <__algorithm/ranges_stable_sort.h> -# include <__algorithm/ranges_swap_ranges.h> -# include <__algorithm/ranges_transform.h> -# include <__algorithm/ranges_unique.h> -# include <__algorithm/ranges_unique_copy.h> -# include <__algorithm/ranges_upper_bound.h> -# include <__algorithm/shift_left.h> -# include <__algorithm/shift_right.h> +# include <__cxx03/__algorithm/in_found_result.h> +# include <__cxx03/__algorithm/in_fun_result.h> +# include <__cxx03/__algorithm/in_in_out_result.h> +# include <__cxx03/__algorithm/in_in_result.h> +# include <__cxx03/__algorithm/in_out_out_result.h> +# include <__cxx03/__algorithm/in_out_result.h> +# include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +# 
include <__cxx03/__algorithm/min_max_result.h> +# include <__cxx03/__algorithm/ranges_adjacent_find.h> +# include <__cxx03/__algorithm/ranges_all_of.h> +# include <__cxx03/__algorithm/ranges_any_of.h> +# include <__cxx03/__algorithm/ranges_binary_search.h> +# include <__cxx03/__algorithm/ranges_clamp.h> +# include <__cxx03/__algorithm/ranges_contains.h> +# include <__cxx03/__algorithm/ranges_copy.h> +# include <__cxx03/__algorithm/ranges_copy_backward.h> +# include <__cxx03/__algorithm/ranges_copy_if.h> +# include <__cxx03/__algorithm/ranges_copy_n.h> +# include <__cxx03/__algorithm/ranges_count.h> +# include <__cxx03/__algorithm/ranges_count_if.h> +# include <__cxx03/__algorithm/ranges_equal.h> +# include <__cxx03/__algorithm/ranges_equal_range.h> +# include <__cxx03/__algorithm/ranges_fill.h> +# include <__cxx03/__algorithm/ranges_fill_n.h> +# include <__cxx03/__algorithm/ranges_find.h> +# include <__cxx03/__algorithm/ranges_find_end.h> +# include <__cxx03/__algorithm/ranges_find_first_of.h> +# include <__cxx03/__algorithm/ranges_find_if.h> +# include <__cxx03/__algorithm/ranges_find_if_not.h> +# include <__cxx03/__algorithm/ranges_for_each.h> +# include <__cxx03/__algorithm/ranges_for_each_n.h> +# include <__cxx03/__algorithm/ranges_generate.h> +# include <__cxx03/__algorithm/ranges_generate_n.h> +# include <__cxx03/__algorithm/ranges_includes.h> +# include <__cxx03/__algorithm/ranges_inplace_merge.h> +# include <__cxx03/__algorithm/ranges_is_heap.h> +# include <__cxx03/__algorithm/ranges_is_heap_until.h> +# include <__cxx03/__algorithm/ranges_is_partitioned.h> +# include <__cxx03/__algorithm/ranges_is_permutation.h> +# include <__cxx03/__algorithm/ranges_is_sorted.h> +# include <__cxx03/__algorithm/ranges_is_sorted_until.h> +# include <__cxx03/__algorithm/ranges_lexicographical_compare.h> +# include <__cxx03/__algorithm/ranges_lower_bound.h> +# include <__cxx03/__algorithm/ranges_make_heap.h> +# include <__cxx03/__algorithm/ranges_max.h> +# include 
<__cxx03/__algorithm/ranges_max_element.h> +# include <__cxx03/__algorithm/ranges_merge.h> +# include <__cxx03/__algorithm/ranges_min.h> +# include <__cxx03/__algorithm/ranges_min_element.h> +# include <__cxx03/__algorithm/ranges_minmax.h> +# include <__cxx03/__algorithm/ranges_minmax_element.h> +# include <__cxx03/__algorithm/ranges_mismatch.h> +# include <__cxx03/__algorithm/ranges_move.h> +# include <__cxx03/__algorithm/ranges_move_backward.h> +# include <__cxx03/__algorithm/ranges_next_permutation.h> +# include <__cxx03/__algorithm/ranges_none_of.h> +# include <__cxx03/__algorithm/ranges_nth_element.h> +# include <__cxx03/__algorithm/ranges_partial_sort.h> +# include <__cxx03/__algorithm/ranges_partial_sort_copy.h> +# include <__cxx03/__algorithm/ranges_partition.h> +# include <__cxx03/__algorithm/ranges_partition_copy.h> +# include <__cxx03/__algorithm/ranges_partition_point.h> +# include <__cxx03/__algorithm/ranges_pop_heap.h> +# include <__cxx03/__algorithm/ranges_prev_permutation.h> +# include <__cxx03/__algorithm/ranges_push_heap.h> +# include <__cxx03/__algorithm/ranges_remove.h> +# include <__cxx03/__algorithm/ranges_remove_copy.h> +# include <__cxx03/__algorithm/ranges_remove_copy_if.h> +# include <__cxx03/__algorithm/ranges_remove_if.h> +# include <__cxx03/__algorithm/ranges_replace.h> +# include <__cxx03/__algorithm/ranges_replace_copy.h> +# include <__cxx03/__algorithm/ranges_replace_copy_if.h> +# include <__cxx03/__algorithm/ranges_replace_if.h> +# include <__cxx03/__algorithm/ranges_reverse.h> +# include <__cxx03/__algorithm/ranges_reverse_copy.h> +# include <__cxx03/__algorithm/ranges_rotate.h> +# include <__cxx03/__algorithm/ranges_rotate_copy.h> +# include <__cxx03/__algorithm/ranges_sample.h> +# include <__cxx03/__algorithm/ranges_search.h> +# include <__cxx03/__algorithm/ranges_search_n.h> +# include <__cxx03/__algorithm/ranges_set_difference.h> +# include <__cxx03/__algorithm/ranges_set_intersection.h> +# include 
<__cxx03/__algorithm/ranges_set_symmetric_difference.h> +# include <__cxx03/__algorithm/ranges_set_union.h> +# include <__cxx03/__algorithm/ranges_shuffle.h> +# include <__cxx03/__algorithm/ranges_sort.h> +# include <__cxx03/__algorithm/ranges_sort_heap.h> +# include <__cxx03/__algorithm/ranges_stable_partition.h> +# include <__cxx03/__algorithm/ranges_stable_sort.h> +# include <__cxx03/__algorithm/ranges_swap_ranges.h> +# include <__cxx03/__algorithm/ranges_transform.h> +# include <__cxx03/__algorithm/ranges_unique.h> +# include <__cxx03/__algorithm/ranges_unique_copy.h> +# include <__cxx03/__algorithm/ranges_upper_bound.h> +# include <__cxx03/__algorithm/shift_left.h> +# include <__cxx03/__algorithm/shift_right.h> #endif #if _LIBCPP_STD_VER >= 23 -# include <__algorithm/fold.h> -# include <__algorithm/ranges_contains_subrange.h> -# include <__algorithm/ranges_ends_with.h> -# include <__algorithm/ranges_find_last.h> -# include <__algorithm/ranges_starts_with.h> +# include <__cxx03/__algorithm/fold.h> +# include <__cxx03/__algorithm/ranges_contains_subrange.h> +# include <__cxx03/__algorithm/ranges_ends_with.h> +# include <__cxx03/__algorithm/ranges_find_last.h> +# include <__cxx03/__algorithm/ranges_starts_with.h> #endif // _LIBCPP_STD_VER >= 23 -#include +#include <__cxx03/version> // standard-mandated includes // [algorithm.syn] -#include +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER == 14 -# include +# include <__cxx03/execution> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/bit> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/iterator> +# include <__cxx03/memory> +# 
include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/utility> #endif #endif // _LIBCPP_ALGORITHM diff --git a/libcxx/include/__cxx03/any b/libcxx/include/__cxx03/any index 5def14dc87e6b..06ce9336abbdf 100644 --- a/libcxx/include/__cxx03/any +++ b/libcxx/include/__cxx03/any @@ -80,40 +80,40 @@ namespace std { */ -#include <__config> -#include <__memory/allocator.h> -#include <__memory/allocator_destructor.h> -#include <__memory/allocator_traits.h> -#include <__memory/unique_ptr.h> -#include <__type_traits/add_const.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/aligned_storage.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_void.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include <__verbose_abort> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_destructor.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__type_traits/add_const.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/aligned_storage.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include 
<__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/initializer_list> +#include <__cxx03/typeinfo> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> namespace std { class _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_AVAILABILITY_BAD_ANY_CAST bad_any_cast : public bad_cast { @@ -596,19 +596,19 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/chrono> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/memory> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/variant> #endif #endif // _LIBCPP_ANY diff --git a/libcxx/include/__cxx03/array b/libcxx/include/__cxx03/array index b8ad288524389..2686dd85668b6 100644 --- a/libcxx/include/__cxx03/array +++ b/libcxx/include/__cxx03/array @@ -111,57 +111,57 @@ template const T&& get(const array&&) noexce */ -#include <__algorithm/equal.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__algorithm/swap_ranges.h> -#include <__assert> -#include <__config> -#include <__fwd/array.h> -#include <__iterator/reverse_iterator.h> -#include 
<__iterator/wrap_iter.h> -#include <__tuple/sfinae_helpers.h> -#include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/remove_cv.h> -#include <__utility/empty.h> -#include <__utility/integer_sequence.h> -#include <__utility/move.h> -#include <__utility/unreachable.h> -#include -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__algorithm/swap_ranges.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__tuple/sfinae_helpers.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__utility/empty.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/stdexcept> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include 
<__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [array.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> // [tuple.helper] -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_size.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -503,12 +503,12 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> +# include <__cxx03/utility> #endif #endif // _LIBCPP_ARRAY diff --git a/libcxx/include/__cxx03/atomic b/libcxx/include/__cxx03/atomic index 0d13619d6ce45..7a0b5bd38fb94 100644 --- a/libcxx/include/__cxx03/atomic +++ b/libcxx/include/__cxx03/atomic @@ -587,30 +587,30 @@ template */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER < 23 && defined(_LIBCPP_STDATOMIC_H) # error is incompatible with before C++23. Please compile with -std=c++23. 
#endif -#include <__atomic/aliases.h> -#include <__atomic/atomic.h> -#include <__atomic/atomic_base.h> -#include <__atomic/atomic_flag.h> -#include <__atomic/atomic_init.h> -#include <__atomic/atomic_lock_free.h> -#include <__atomic/atomic_sync.h> -#include <__atomic/check_memory_order.h> -#include <__atomic/contention_t.h> -#include <__atomic/cxx_atomic_impl.h> -#include <__atomic/fence.h> -#include <__atomic/is_always_lock_free.h> -#include <__atomic/kill_dependency.h> -#include <__atomic/memory_order.h> -#include +#include <__cxx03/__atomic/aliases.h> +#include <__cxx03/__atomic/atomic.h> +#include <__cxx03/__atomic/atomic_base.h> +#include <__cxx03/__atomic/atomic_flag.h> +#include <__cxx03/__atomic/atomic_init.h> +#include <__cxx03/__atomic/atomic_lock_free.h> +#include <__cxx03/__atomic/atomic_sync.h> +#include <__cxx03/__atomic/check_memory_order.h> +#include <__cxx03/__atomic/contention_t.h> +#include <__cxx03/__atomic/cxx_atomic_impl.h> +#include <__cxx03/__atomic/fence.h> +#include <__cxx03/__atomic/is_always_lock_free.h> +#include <__cxx03/__atomic/kill_dependency.h> +#include <__cxx03/__atomic/memory_order.h> +#include <__cxx03/version> #if _LIBCPP_STD_VER >= 20 -# include <__atomic/atomic_ref.h> +# include <__cxx03/__atomic/atomic_ref.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -622,11 +622,11 @@ template #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/cmath> +# include <__cxx03/compare> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_ATOMIC diff --git a/libcxx/include/__cxx03/barrier b/libcxx/include/__cxx03/barrier index edee181273e24..5aee9e7210ec4 100644 --- a/libcxx/include/__cxx03/barrier +++ b/libcxx/include/__cxx03/barrier @@ -45,28 +45,28 @@ namespace std */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# 
include <__assert> -# include <__atomic/atomic_base.h> -# include <__atomic/memory_order.h> -# include <__memory/unique_ptr.h> -# include <__thread/poll_with_backoff.h> -# include <__thread/timed_backoff_policy.h> -# include <__utility/move.h> -# include -# include -# include -# include +# include <__cxx03/__assert> +# include <__cxx03/__atomic/atomic_base.h> +# include <__cxx03/__atomic/memory_order.h> +# include <__cxx03/__memory/unique_ptr.h> +# include <__cxx03/__thread/poll_with_backoff.h> +# include <__cxx03/__thread/timed_backoff_policy.h> +# include <__cxx03/__utility/move.h> +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include <__cxx03/limits> +# include <__cxx03/version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> # if _LIBCPP_STD_VER >= 14 @@ -297,12 +297,12 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/iterator> +# include <__cxx03/memory> +# include <__cxx03/stdexcept> +# include <__cxx03/variant> #endif #endif //_LIBCPP_BARRIER diff --git a/libcxx/include/__cxx03/bit b/libcxx/include/__cxx03/bit index 94387d101a398..e220eedfeaa92 100644 --- a/libcxx/include/__cxx03/bit +++ b/libcxx/include/__cxx03/bit @@ -61,41 +61,41 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 -# include <__bit/bit_cast.h> -# include <__bit/bit_ceil.h> -# include <__bit/bit_floor.h> -# include <__bit/bit_log2.h> -# include <__bit/bit_width.h> -# include <__bit/countl.h> -# include <__bit/countr.h> -# include <__bit/endian.h> -# include <__bit/has_single_bit.h> -# include <__bit/popcount.h> -# include <__bit/rotate.h> +# include <__cxx03/__bit/bit_cast.h> +# 
include <__cxx03/__bit/bit_ceil.h> +# include <__cxx03/__bit/bit_floor.h> +# include <__cxx03/__bit/bit_log2.h> +# include <__cxx03/__bit/bit_width.h> +# include <__cxx03/__bit/countl.h> +# include <__cxx03/__bit/countr.h> +# include <__cxx03/__bit/endian.h> +# include <__cxx03/__bit/has_single_bit.h> +# include <__cxx03/__bit/popcount.h> +# include <__cxx03/__bit/rotate.h> #endif #if _LIBCPP_STD_VER >= 23 -# include <__bit/byteswap.h> +# include <__cxx03/__bit/byteswap.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/cstdint> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_BIT diff --git a/libcxx/include/__cxx03/bitset b/libcxx/include/__cxx03/bitset index 6bd7bfe585f38..f41d78ce27f08 100644 --- a/libcxx/include/__cxx03/bitset +++ b/libcxx/include/__cxx03/bitset @@ -126,32 +126,32 @@ template struct hash>; // clang-format on -#include <__algorithm/count.h> -#include <__algorithm/fill.h> -#include <__algorithm/find.h> -#include <__bit_reference> -#include <__config> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__type_traits/is_char_like_type.h> -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/count.h> +#include <__cxx03/__algorithm/fill.h> +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__bit_reference> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__type_traits/is_char_like_type.h> +#include <__cxx03/climits> +#include <__cxx03/cstddef> +#include <__cxx03/stdexcept> +#include 
<__cxx03/string_view> +#include <__cxx03/version> // standard-mandated includes // [bitset.syn] -#include -#include +#include <__cxx03/iosfwd> +#include <__cxx03/string> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -961,9 +961,9 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_BITSET diff --git a/libcxx/include/__cxx03/cassert b/libcxx/include/__cxx03/cassert index 6fec37dc63761..a3695b32a0511 100644 --- a/libcxx/include/__cxx03/cassert +++ b/libcxx/include/__cxx03/cassert @@ -16,7 +16,7 @@ Macros: */ -#include <__config> +#include <__cxx03/__config> // is not provided by libc++ #if __has_include() diff --git a/libcxx/include/__cxx03/ccomplex b/libcxx/include/__cxx03/ccomplex index 94d2c8d7d003d..fb16b8fbb54af 100644 --- a/libcxx/include/__cxx03/ccomplex +++ b/libcxx/include/__cxx03/ccomplex @@ -13,11 +13,11 @@ /* ccomplex synopsis -#include +#include <__cxx03/complex> */ -#include +#include <__cxx03/complex> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/cctype b/libcxx/include/__cxx03/cctype index d7af7e084aa23..1bfb66b704a27 100644 --- a/libcxx/include/__cxx03/cctype +++ b/libcxx/include/__cxx03/cctype @@ -34,9 +34,9 @@ int toupper(int c); } // std */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/ctype.h> #ifndef _LIBCPP_CTYPE_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/cerrno b/libcxx/include/__cxx03/cerrno index f1295680fed6c..987e7492ce05a 100644 --- a/libcxx/include/__cxx03/cerrno +++ b/libcxx/include/__cxx03/cerrno @@ -22,9 +22,9 @@ Macros: */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/errno.h> #ifndef _LIBCPP_ERRNO_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cfenv b/libcxx/include/__cxx03/cfenv index f8cacd562f76b..59c56d29a7c4a 100644 --- a/libcxx/include/__cxx03/cfenv +++ b/libcxx/include/__cxx03/cfenv @@ -52,9 +52,9 @@ int feupdateenv(const fenv_t* envp); } // std */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/fenv.h> #ifndef _LIBCPP_FENV_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cfloat b/libcxx/include/__cxx03/cfloat index 5d1b38c557dca..5f65b014ca121 100644 --- a/libcxx/include/__cxx03/cfloat +++ b/libcxx/include/__cxx03/cfloat @@ -69,9 +69,9 @@ Macros: LDBL_TRUE_MIN // C11 */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/float.h> #ifndef _LIBCPP_FLOAT_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/charconv b/libcxx/include/__cxx03/charconv index a2e270e9316dc..0135a74fba8a5 100644 --- a/libcxx/include/__cxx03/charconv +++ b/libcxx/include/__cxx03/charconv @@ -69,22 +69,22 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 17 -# include <__charconv/chars_format.h> -# include <__charconv/from_chars_integral.h> -# include <__charconv/from_chars_result.h> -# include <__charconv/tables.h> -# include <__charconv/to_chars.h> -# include <__charconv/to_chars_base_10.h> -# include <__charconv/to_chars_floating_point.h> -# include <__charconv/to_chars_integral.h> -# include <__charconv/to_chars_result.h> -# include <__charconv/traits.h> +# include <__cxx03/__charconv/chars_format.h> +# include <__cxx03/__charconv/from_chars_integral.h> +# include <__cxx03/__charconv/from_chars_result.h> +# include <__cxx03/__charconv/tables.h> +# include <__cxx03/__charconv/to_chars.h> +# include <__cxx03/__charconv/to_chars_base_10.h> +# include <__cxx03/__charconv/to_chars_floating_point.h> +# include <__cxx03/__charconv/to_chars_integral.h> +# include <__cxx03/__charconv/to_chars_result.h> +# include <__cxx03/__charconv/traits.h> #endif // _LIBCPP_STD_VER >= 17 -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -95,21 +95,21 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include -# include -# include +# include <__cxx03/cerrno> +# include <__cxx03/cstddef> +# include <__cxx03/initializer_list> +# include <__cxx03/new> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/cmath> +# include <__cxx03/concepts> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include 
<__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_CHARCONV diff --git a/libcxx/include/__cxx03/chrono b/libcxx/include/__cxx03/chrono index 7bec5e5a26ef4..f39abf1897df9 100644 --- a/libcxx/include/__cxx03/chrono +++ b/libcxx/include/__cxx03/chrono @@ -15,7 +15,7 @@ /* chrono synopsis -#include // C++20 +#include <__cxx03/compare> // C++20 namespace std { @@ -939,83 +939,83 @@ constexpr chrono::year operator ""y(unsigned lo // clang-format on -#include <__config> +#include <__cxx03/__config> -#include <__chrono/duration.h> -#include <__chrono/file_clock.h> -#include <__chrono/high_resolution_clock.h> -#include <__chrono/steady_clock.h> -#include <__chrono/system_clock.h> -#include <__chrono/time_point.h> +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/file_clock.h> +#include <__cxx03/__chrono/high_resolution_clock.h> +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/system_clock.h> +#include <__cxx03/__chrono/time_point.h> #if _LIBCPP_STD_VER >= 20 -# include <__chrono/calendar.h> -# include <__chrono/day.h> -# include <__chrono/exception.h> -# include <__chrono/hh_mm_ss.h> -# include <__chrono/literals.h> -# include <__chrono/local_info.h> -# include <__chrono/month.h> -# include <__chrono/month_weekday.h> -# include <__chrono/monthday.h> -# include <__chrono/sys_info.h> -# include <__chrono/weekday.h> -# include <__chrono/year.h> -# include <__chrono/year_month.h> -# include <__chrono/year_month_day.h> -# include <__chrono/year_month_weekday.h> +# include <__cxx03/__chrono/calendar.h> +# include <__cxx03/__chrono/day.h> +# include <__cxx03/__chrono/exception.h> +# include <__cxx03/__chrono/hh_mm_ss.h> +# include <__cxx03/__chrono/literals.h> +# include <__cxx03/__chrono/local_info.h> +# include <__cxx03/__chrono/month.h> +# include <__cxx03/__chrono/month_weekday.h> +# include <__cxx03/__chrono/monthday.h> +# include <__cxx03/__chrono/sys_info.h> +# include 
<__cxx03/__chrono/weekday.h> +# include <__cxx03/__chrono/year.h> +# include <__cxx03/__chrono/year_month.h> +# include <__cxx03/__chrono/year_month_day.h> +# include <__cxx03/__chrono/year_month_weekday.h> # if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__chrono/formatter.h> -# include <__chrono/ostream.h> -# include <__chrono/parser_std_format_spec.h> -# include <__chrono/statically_widen.h> +# include <__cxx03/__chrono/formatter.h> +# include <__cxx03/__chrono/ostream.h> +# include <__cxx03/__chrono/parser_std_format_spec.h> +# include <__cxx03/__chrono/statically_widen.h> # endif # if !defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) && !defined(_LIBCPP_HAS_NO_FILESYSTEM) && \ !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__chrono/leap_second.h> -# include <__chrono/time_zone.h> -# include <__chrono/time_zone_link.h> -# include <__chrono/tzdb.h> -# include <__chrono/tzdb_list.h> -# include <__chrono/zoned_time.h> +# include <__cxx03/__chrono/leap_second.h> +# include <__cxx03/__chrono/time_zone.h> +# include <__cxx03/__chrono/time_zone_link.h> +# include <__cxx03/__chrono/tzdb.h> +# include <__cxx03/__chrono/tzdb_list.h> +# include <__cxx03/__chrono/zoned_time.h> # endif #endif -#include +#include <__cxx03/version> // standard-mandated includes // [time.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -# include -# include -# include +# include <__cxx03/cstdint> +# include <__cxx03/stdexcept> +# include <__cxx03/string_view> +# include <__cxx03/vector> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/bit> +# include <__cxx03/concepts> +# include <__cxx03/cstring> +# include <__cxx03/forward_list> +# include <__cxx03/string> +# include <__cxx03/tuple> #endif #if 
!defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER == 20 -# include +# include <__cxx03/charconv> # if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -# include +# include <__cxx03/locale> +# include <__cxx03/ostream> # endif #endif diff --git a/libcxx/include/__cxx03/cinttypes b/libcxx/include/__cxx03/cinttypes index 52663a4f35fad..146f5b8fce0c9 100644 --- a/libcxx/include/__cxx03/cinttypes +++ b/libcxx/include/__cxx03/cinttypes @@ -15,7 +15,7 @@ This entire header is C99 / C++0X -#include // includes +#include <__cxx03/cstdint> // includes Macros: @@ -234,14 +234,14 @@ uintmax_t wcstoumax(const wchar_t* restrict nptr, wchar_t** restrict endptr, int } // std */ -#include <__config> +#include <__cxx03/__config> // standard-mandated includes // [cinttypes.syn] -#include +#include <__cxx03/cstdint> -#include +#include <__cxx03/inttypes.h> #ifndef _LIBCPP_INTTYPES_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/ciso646 b/libcxx/include/__cxx03/ciso646 index 1d859f08fac57..7109a31b52f24 100644 --- a/libcxx/include/__cxx03/ciso646 +++ b/libcxx/include/__cxx03/ciso646 @@ -15,7 +15,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/climits b/libcxx/include/__cxx03/climits index bcd8b4a56a073..aff33b687bdef 100644 --- a/libcxx/include/__cxx03/climits +++ b/libcxx/include/__cxx03/climits @@ -37,9 +37,9 @@ Macros: */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/limits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/clocale b/libcxx/include/__cxx03/clocale index c689a64be288a..d8fc91465ef74 100644 --- a/libcxx/include/__cxx03/clocale +++ b/libcxx/include/__cxx03/clocale @@ -34,9 +34,9 @@ lconv* localeconv(); */ -#include <__config> +#include <__cxx03/__config> -#include +#include 
<__cxx03/locale.h> #ifndef _LIBCPP_LOCALE_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cmath b/libcxx/include/__cxx03/cmath index 6480c4678ce33..659b5c965d6e9 100644 --- a/libcxx/include/__cxx03/cmath +++ b/libcxx/include/__cxx03/cmath @@ -312,20 +312,20 @@ constexpr long double lerp(long double a, long double b, long double t) noexcept */ -#include <__config> -#include <__math/hypot.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_floating_point.h> -#include <__type_traits/is_same.h> -#include <__type_traits/promote.h> -#include <__type_traits/remove_cv.h> -#include -#include - -#include <__math/special_functions.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__math/hypot.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/promote.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/limits> +#include <__cxx03/version> + +#include <__cxx03/__math/special_functions.h> +#include <__cxx03/math.h> #ifndef _LIBCPP_MATH_H # error tried including but didn't find libc++'s header. 
\ @@ -340,7 +340,7 @@ constexpr long double lerp(long double a, long double b, long double t) noexcept #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -640,7 +640,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_CMATH diff --git a/libcxx/include/__cxx03/codecvt b/libcxx/include/__cxx03/codecvt index 65cd752d69460..827822bf12466 100644 --- a/libcxx/include/__cxx03/codecvt +++ b/libcxx/include/__cxx03/codecvt @@ -54,9 +54,9 @@ class codecvt_utf8_utf16 */ -#include <__config> -#include <__locale> -#include +#include <__cxx03/__config> +#include <__cxx03/__locale> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -579,19 +579,19 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstddef> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/initializer_list> +# include <__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_CODECVT diff --git a/libcxx/include/__cxx03/compare b/libcxx/include/__cxx03/compare index 8a41835b14897..03b87ea1d1071 100644 --- a/libcxx/include/__cxx03/compare +++ b/libcxx/include/__cxx03/compare @@ -140,39 +140,39 @@ namespace std { } */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 
-# include <__compare/common_comparison_category.h> -# include <__compare/compare_partial_order_fallback.h> -# include <__compare/compare_strong_order_fallback.h> -# include <__compare/compare_three_way.h> -# include <__compare/compare_three_way_result.h> -# include <__compare/compare_weak_order_fallback.h> -# include <__compare/is_eq.h> -# include <__compare/ordering.h> -# include <__compare/partial_order.h> -# include <__compare/strong_order.h> -# include <__compare/synth_three_way.h> -# include <__compare/three_way_comparable.h> -# include <__compare/weak_order.h> +# include <__cxx03/__compare/common_comparison_category.h> +# include <__cxx03/__compare/compare_partial_order_fallback.h> +# include <__cxx03/__compare/compare_strong_order_fallback.h> +# include <__cxx03/__compare/compare_three_way.h> +# include <__cxx03/__compare/compare_three_way_result.h> +# include <__cxx03/__compare/compare_weak_order_fallback.h> +# include <__cxx03/__compare/is_eq.h> +# include <__cxx03/__compare/ordering.h> +# include <__cxx03/__compare/partial_order.h> +# include <__cxx03/__compare/strong_order.h> +# include <__cxx03/__compare/synth_three_way.h> +# include <__cxx03/__compare/three_way_comparable.h> +# include <__cxx03/__compare/weak_order.h> #endif // _LIBCPP_STD_VER >= 20 -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -# include -# include +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include <__cxx03/limits> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/cmath> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_COMPARE diff --git a/libcxx/include/__cxx03/complex b/libcxx/include/__cxx03/complex index e6534025de57e..0bad3de200084 100644 --- a/libcxx/include/__cxx03/complex +++ b/libcxx/include/__cxx03/complex @@ -256,18 
+256,18 @@ template complex tanh (const complex&); */ -#include <__config> -#include <__fwd/complex.h> -#include <__fwd/tuple.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_size.h> -#include <__type_traits/conditional.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/complex.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cmath> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include // for std::basic_ostringstream +# include <__cxx03/sstream> // for std::basic_ostringstream #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -275,7 +275,7 @@ template complex tanh (const complex&); #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1472,9 +1472,9 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/iosfwd> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_COMPLEX diff --git a/libcxx/include/__cxx03/complex.h b/libcxx/include/__cxx03/complex.h index a3da21c843f36..a20ab88802060 100644 --- a/libcxx/include/__cxx03/complex.h +++ b/libcxx/include/__cxx03/complex.h @@ -13,18 +13,18 @@ /* complex.h synopsis -#include +#include <__cxx03/ccomplex> */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #ifdef __cplusplus -# include +# include <__cxx03/ccomplex> #elif __has_include_next() # include_next #endif diff --git a/libcxx/include/__cxx03/concepts b/libcxx/include/__cxx03/concepts index e89d216a59372..aa6e62e4d39c8 100644 --- a/libcxx/include/__cxx03/concepts +++ 
b/libcxx/include/__cxx03/concepts @@ -129,41 +129,41 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 -# include <__concepts/arithmetic.h> -# include <__concepts/assignable.h> -# include <__concepts/boolean_testable.h> -# include <__concepts/class_or_enum.h> -# include <__concepts/common_reference_with.h> -# include <__concepts/common_with.h> -# include <__concepts/constructible.h> -# include <__concepts/convertible_to.h> -# include <__concepts/copyable.h> -# include <__concepts/derived_from.h> -# include <__concepts/destructible.h> -# include <__concepts/different_from.h> -# include <__concepts/equality_comparable.h> -# include <__concepts/invocable.h> -# include <__concepts/movable.h> -# include <__concepts/predicate.h> -# include <__concepts/regular.h> -# include <__concepts/relation.h> -# include <__concepts/same_as.h> -# include <__concepts/semiregular.h> -# include <__concepts/swappable.h> -# include <__concepts/totally_ordered.h> +# include <__cxx03/__concepts/arithmetic.h> +# include <__cxx03/__concepts/assignable.h> +# include <__cxx03/__concepts/boolean_testable.h> +# include <__cxx03/__concepts/class_or_enum.h> +# include <__cxx03/__concepts/common_reference_with.h> +# include <__cxx03/__concepts/common_with.h> +# include <__cxx03/__concepts/constructible.h> +# include <__cxx03/__concepts/convertible_to.h> +# include <__cxx03/__concepts/copyable.h> +# include <__cxx03/__concepts/derived_from.h> +# include <__cxx03/__concepts/destructible.h> +# include <__cxx03/__concepts/different_from.h> +# include <__cxx03/__concepts/equality_comparable.h> +# include <__cxx03/__concepts/invocable.h> +# include <__cxx03/__concepts/movable.h> +# include <__cxx03/__concepts/predicate.h> +# include <__cxx03/__concepts/regular.h> +# include <__cxx03/__concepts/relation.h> +# include <__cxx03/__concepts/same_as.h> +# include <__cxx03/__concepts/semiregular.h> +# include <__cxx03/__concepts/swappable.h> +# include 
<__cxx03/__concepts/totally_ordered.h> #endif // _LIBCPP_STD_VER >= 20 -#include +#include <__cxx03/version> #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/cstddef> #endif #if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) -# include +# include <__cxx03/type_traits> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__cxx03/condition_variable b/libcxx/include/__cxx03/condition_variable index 5195cd6057dd3..db862c44f9d3f 100644 --- a/libcxx/include/__cxx03/condition_variable +++ b/libcxx/include/__cxx03/condition_variable @@ -118,27 +118,27 @@ public: */ -#include <__chrono/duration.h> -#include <__chrono/steady_clock.h> -#include <__chrono/time_point.h> -#include <__condition_variable/condition_variable.h> -#include <__config> -#include <__memory/shared_ptr.h> -#include <__mutex/lock_guard.h> -#include <__mutex/mutex.h> -#include <__mutex/tag_types.h> -#include <__mutex/unique_lock.h> -#include <__stop_token/stop_callback.h> -#include <__stop_token/stop_token.h> -#include <__utility/move.h> -#include +#include <__cxx03/__chrono/duration.h> +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__condition_variable/condition_variable.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__mutex/lock_guard.h> +#include <__cxx03/__mutex/mutex.h> +#include <__cxx03/__mutex/tag_types.h> +#include <__cxx03/__mutex/unique_lock.h> +#include <__cxx03/__stop_token/stop_callback.h> +#include <__cxx03/__stop_token/stop_token.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #ifndef _LIBCPP_HAS_NO_THREADS @@ -352,18 +352,18 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if 
!defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/initializer_list> +# include <__cxx03/iosfwd> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/system_error> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_CONDITION_VARIABLE diff --git a/libcxx/include/__cxx03/coroutine b/libcxx/include/__cxx03/coroutine index ee54388ad5aaf..2a8a06e63cc28 100644 --- a/libcxx/include/__cxx03/coroutine +++ b/libcxx/include/__cxx03/coroutine @@ -38,30 +38,30 @@ struct suspend_always; */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 -# include <__coroutine/coroutine_handle.h> -# include <__coroutine/coroutine_traits.h> -# include <__coroutine/noop_coroutine_handle.h> -# include <__coroutine/trivial_awaitables.h> +# include <__cxx03/__coroutine/coroutine_handle.h> +# include <__cxx03/__coroutine/coroutine_traits.h> +# include <__cxx03/__coroutine/noop_coroutine_handle.h> +# include <__cxx03/__coroutine/trivial_awaitables.h> #endif // _LIBCPP_STD_VER >= 20 -#include +#include <__cxx03/version> // standard-mandated includes // [coroutine.syn] -#include +#include <__cxx03/compare> #ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_COROUTINE diff --git a/libcxx/include/__cxx03/csetjmp b/libcxx/include/__cxx03/csetjmp index 7ba90068710ae..be9391506d0f4 100644 --- a/libcxx/include/__cxx03/csetjmp +++ b/libcxx/include/__cxx03/csetjmp @@ -30,7 
+30,7 @@ void longjmp(jmp_buf env, int val); */ -#include <__config> +#include <__cxx03/__config> // is not provided by libc++ #if __has_include() diff --git a/libcxx/include/__cxx03/csignal b/libcxx/include/__cxx03/csignal index 804a7f95ae968..f8f511fbca432 100644 --- a/libcxx/include/__cxx03/csignal +++ b/libcxx/include/__cxx03/csignal @@ -39,7 +39,7 @@ int raise(int sig); */ -#include <__config> +#include <__cxx03/__config> // is not provided by libc++ #if __has_include() diff --git a/libcxx/include/__cxx03/cstdarg b/libcxx/include/__cxx03/cstdarg index 4642eb7b5258c..ae04819a679ab 100644 --- a/libcxx/include/__cxx03/cstdarg +++ b/libcxx/include/__cxx03/cstdarg @@ -31,7 +31,7 @@ Types: */ -#include <__config> +#include <__cxx03/__config> // is not provided by libc++ #if __has_include() diff --git a/libcxx/include/__cxx03/cstdbool b/libcxx/include/__cxx03/cstdbool index ef731c021a4ab..7449c40780103 100644 --- a/libcxx/include/__cxx03/cstdbool +++ b/libcxx/include/__cxx03/cstdbool @@ -19,7 +19,7 @@ Macros: */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/cstddef b/libcxx/include/__cxx03/cstddef index 1a4049e4d34f2..284f2fb465617 100644 --- a/libcxx/include/__cxx03/cstddef +++ b/libcxx/include/__cxx03/cstddef @@ -33,13 +33,13 @@ Types: */ -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_integral.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/version> -#include +#include <__cxx03/stddef.h> #ifndef _LIBCPP_STDDEF_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/cstdint b/libcxx/include/__cxx03/cstdint index 8c4782859426d..8dd7e8510130d 100644 --- a/libcxx/include/__cxx03/cstdint +++ b/libcxx/include/__cxx03/cstdint @@ -140,9 +140,9 @@ Types: } // std */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/stdint.h> #ifndef _LIBCPP_STDINT_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cstdio b/libcxx/include/__cxx03/cstdio index 7f94371081f8b..3085e6d3733ac 100644 --- a/libcxx/include/__cxx03/cstdio +++ b/libcxx/include/__cxx03/cstdio @@ -95,9 +95,9 @@ void perror(const char* s); } // std */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/stdio.h> #ifndef _LIBCPP_STDIO_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cstdlib b/libcxx/include/__cxx03/cstdlib index c817fd8f4accd..078db2b1a9115 100644 --- a/libcxx/include/__cxx03/cstdlib +++ b/libcxx/include/__cxx03/cstdlib @@ -81,9 +81,9 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/stdlib.h> #ifndef _LIBCPP_STDLIB_H # error tried including but didn't find libc++'s header. \ diff --git a/libcxx/include/__cxx03/cstring b/libcxx/include/__cxx03/cstring index c2c92b02e73cc..693777c0f0bdf 100644 --- a/libcxx/include/__cxx03/cstring +++ b/libcxx/include/__cxx03/cstring @@ -56,10 +56,10 @@ size_t strlen(const char* s); */ -#include <__config> -#include <__type_traits/is_constant_evaluated.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_constant_evaluated.h> -#include +#include <__cxx03/string.h> #ifndef _LIBCPP_STRING_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/ctgmath b/libcxx/include/__cxx03/ctgmath index 6237979be4906..5da5f6dfb66d6 100644 --- a/libcxx/include/__cxx03/ctgmath +++ b/libcxx/include/__cxx03/ctgmath @@ -13,13 +13,13 @@ /* ctgmath synopsis -#include -#include +#include <__cxx03/ccomplex> +#include <__cxx03/cmath> */ -#include -#include +#include <__cxx03/ccomplex> +#include <__cxx03/cmath> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/ctime b/libcxx/include/__cxx03/ctime index f47b49a43e23e..84080a9c5e953 100644 --- a/libcxx/include/__cxx03/ctime +++ b/libcxx/include/__cxx03/ctime @@ -45,7 +45,7 @@ int timespec_get( struct timespec *ts, int base); // C++17 */ -#include <__config> +#include <__cxx03/__config> // is not provided by libc++ #if __has_include() diff --git a/libcxx/include/__cxx03/ctype.h b/libcxx/include/__cxx03/ctype.h index 448e4410c554f..e169040376d4d 100644 --- a/libcxx/include/__cxx03/ctype.h +++ b/libcxx/include/__cxx03/ctype.h @@ -29,7 +29,7 @@ int tolower(int c); int toupper(int c); */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/cuchar b/libcxx/include/__cxx03/cuchar index f0015be275367..1029f7376756b 100644 --- a/libcxx/include/__cxx03/cuchar +++ b/libcxx/include/__cxx03/cuchar @@ -36,9 +36,9 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps); */ -#include <__config> +#include <__cxx03/__config> -#include +#include <__cxx03/uchar.h> #ifndef _LIBCPP_UCHAR_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/cwchar b/libcxx/include/__cxx03/cwchar index 08cfac58c846a..50fa5607fc4a1 100644 --- a/libcxx/include/__cxx03/cwchar +++ b/libcxx/include/__cxx03/cwchar @@ -102,15 +102,15 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, */ -#include <__config> -#include <__type_traits/copy_cv.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_equality_comparable.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/copy_cv.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_equality_comparable.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/cwctype> -#include +#include <__cxx03/wchar.h> #ifndef _LIBCPP_WCHAR_H # error tried including but didn't find libc++'s header. \ @@ -255,7 +255,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_wmemchr(_Tp _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/cstddef> #endif #endif // _LIBCPP_CWCHAR diff --git a/libcxx/include/__cxx03/cwctype b/libcxx/include/__cxx03/cwctype index 04abfabef5793..791277f9c1032 100644 --- a/libcxx/include/__cxx03/cwctype +++ b/libcxx/include/__cxx03/cwctype @@ -49,10 +49,10 @@ wctrans_t wctrans(const char* property); */ -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cctype> -#include +#include <__cxx03/wctype.h> #ifndef _LIBCPP_WCTYPE_H # error tried including but didn't find libc++'s header. 
\ diff --git a/libcxx/include/__cxx03/deque b/libcxx/include/__cxx03/deque index e73135a8647b9..7525a5b59a9bd 100644 --- a/libcxx/include/__cxx03/deque +++ b/libcxx/include/__cxx03/deque @@ -177,72 +177,72 @@ template */ -#include <__algorithm/copy.h> -#include <__algorithm/copy_backward.h> -#include <__algorithm/copy_n.h> -#include <__algorithm/equal.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__algorithm/min.h> -#include <__algorithm/remove.h> -#include <__algorithm/remove_if.h> -#include <__algorithm/unwrap_iter.h> -#include <__assert> -#include <__config> -#include <__debug_utils/sanitizers.h> -#include <__format/enable_insertable.h> -#include <__fwd/deque.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/next.h> -#include <__iterator/prev.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/segmented_iterator.h> -#include <__memory/addressof.h> -#include <__memory/allocator_destructor.h> -#include <__memory/pointer_traits.h> -#include <__memory/temp_value.h> -#include <__memory/unique_ptr.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__ranges/size.h> -#include <__split_buffer> -#include <__type_traits/is_allocator.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include -#include -#include +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/copy_backward.h> +#include <__cxx03/__algorithm/copy_n.h> +#include <__cxx03/__algorithm/equal.h> +#include 
<__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/remove.h> +#include <__cxx03/__algorithm/remove_if.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/sanitizers.h> +#include <__cxx03/__format/enable_insertable.h> +#include <__cxx03/__fwd/deque.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/segmented_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator_destructor.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/temp_value.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__split_buffer> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/limits> +#include <__cxx03/stdexcept> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include 
<__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [deque.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -2607,15 +2607,15 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/functional> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_DEQUE diff --git a/libcxx/include/__cxx03/errno.h b/libcxx/include/__cxx03/errno.h index e657ad84ff44b..0ebd03c0b6a62 100644 --- a/libcxx/include/__cxx03/errno.h +++ b/libcxx/include/__cxx03/errno.h @@ -22,7 +22,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/exception b/libcxx/include/__cxx03/exception index 5eff8e3f8a4bf..a6cb38526ff36 100644 --- a/libcxx/include/__cxx03/exception +++ b/libcxx/include/__cxx03/exception @@ -76,21 +76,21 @@ template void rethrow_if_nested(const E& e); */ -#include <__config> -#include <__exception/exception.h> -#include <__exception/exception_ptr.h> -#include <__exception/nested_exception.h> -#include <__exception/operations.h> -#include <__exception/terminate.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include 
<__cxx03/__exception/exception_ptr.h> +#include <__cxx03/__exception/nested_exception.h> +#include <__cxx03/__exception/operations.h> +#include <__cxx03/__exception/terminate.h> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/include/__cxx03/execution b/libcxx/include/__cxx03/execution index 94d434b2e4603..c04c665e6908b 100644 --- a/libcxx/include/__cxx03/execution +++ b/libcxx/include/__cxx03/execution @@ -32,11 +32,11 @@ namespace std { } */ -#include <__config> -#include <__type_traits/is_execution_policy.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cvref.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_execution_policy.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -143,7 +143,7 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/cstddef> #endif #endif // _LIBCPP_EXECUTION diff --git a/libcxx/include/__cxx03/expected b/libcxx/include/__cxx03/expected index 6a2f12f2bf3b5..c3bd4af17414d 100644 --- a/libcxx/include/__cxx03/expected +++ b/libcxx/include/__cxx03/expected @@ -38,25 +38,25 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 23 -# include <__expected/bad_expected_access.h> -# include <__expected/expected.h> -# include <__expected/unexpect.h> -# include <__expected/unexpected.h> +# include <__cxx03/__expected/bad_expected_access.h> +# include <__cxx03/__expected/expected.h> +# 
include <__cxx03/__expected/unexpect.h> +# include <__cxx03/__expected/unexpected.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/cstddef> +# include <__cxx03/initializer_list> +# include <__cxx03/new> #endif #endif // _LIBCPP_EXPECTED diff --git a/libcxx/include/__cxx03/experimental/__config b/libcxx/include/__cxx03/experimental/__config index 7b23791511cef..8f6065f919c7c 100644 --- a/libcxx/include/__cxx03/experimental/__config +++ b/libcxx/include/__cxx03/experimental/__config @@ -10,7 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL_CONFIG #define _LIBCPP_EXPERIMENTAL_CONFIG -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/experimental/__simd/aligned_tag.h b/libcxx/include/__cxx03/experimental/__simd/aligned_tag.h index 31d2b50aa1dd3..bbe6ee04d7c5e 100644 --- a/libcxx/include/__cxx03/experimental/__simd/aligned_tag.h +++ b/libcxx/include/__cxx03/experimental/__simd/aligned_tag.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H -#include <__memory/assume_aligned.h> -#include <__type_traits/remove_const.h> -#include -#include -#include +#include <__cxx03/__memory/assume_aligned.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/traits.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/declaration.h b/libcxx/include/__cxx03/experimental/__simd/declaration.h index 7b45d035c2712..7a57e3b888916 100644 --- a/libcxx/include/__cxx03/experimental/__simd/declaration.h +++ 
b/libcxx/include/__cxx03/experimental/__simd/declaration.h @@ -10,8 +10,8 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H #define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H -#include -#include +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/reference.h b/libcxx/include/__cxx03/experimental/__simd/reference.h index af61dbcc2fe92..b9e0289c0ee9b 100644 --- a/libcxx/include/__cxx03/experimental/__simd/reference.h +++ b/libcxx/include/__cxx03/experimental/__simd/reference.h @@ -10,16 +10,16 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include -#include -#include +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/utility.h> _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/scalar.h b/libcxx/include/__cxx03/experimental/__simd/scalar.h index 1add4653209ac..938fee89fa47a 100644 --- a/libcxx/include/__cxx03/experimental/__simd/scalar.h +++ b/libcxx/include/__cxx03/experimental/__simd/scalar.h @@ -10,11 +10,11 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H -#include <__assert> -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/traits.h> #if 
_LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/simd.h b/libcxx/include/__cxx03/experimental/__simd/simd.h index 37e334aad6da0..4e170997fbada 100644 --- a/libcxx/include/__cxx03/experimental/__simd/simd.h +++ b/libcxx/include/__cxx03/experimental/__simd/simd.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cvref.h> -#include <__utility/forward.h> -#include -#include -#include -#include -#include -#include +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/reference.h> +#include <__cxx03/experimental/__simd/traits.h> +#include <__cxx03/experimental/__simd/utility.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/simd_mask.h b/libcxx/include/__cxx03/experimental/__simd/simd_mask.h index fd6dee2e28ee9..fa2f134ea7a3f 100644 --- a/libcxx/include/__cxx03/experimental/__simd/simd_mask.h +++ b/libcxx/include/__cxx03/experimental/__simd/simd_mask.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H -#include <__type_traits/is_same.h> -#include -#include -#include -#include -#include +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/reference.h> +#include <__cxx03/experimental/__simd/traits.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/traits.h 
b/libcxx/include/__cxx03/experimental/__simd/traits.h index ec25b4bfa7f95..3a0480b512e2e 100644 --- a/libcxx/include/__cxx03/experimental/__simd/traits.h +++ b/libcxx/include/__cxx03/experimental/__simd/traits.h @@ -10,13 +10,13 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H #define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H -#include <__bit/bit_ceil.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include -#include -#include -#include +#include <__cxx03/__bit/bit_ceil.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/utility.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/utility.h b/libcxx/include/__cxx03/experimental/__simd/utility.h index 708fa3d8f72ce..803378ddcdd3d 100644 --- a/libcxx/include/__cxx03/experimental/__simd/utility.h +++ b/libcxx/include/__cxx03/experimental/__simd/utility.h @@ -10,23 +10,23 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H #define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/is_volatile.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/integer_sequence.h> -#include -#include -#include -#include +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include 
<__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> +#include <__cxx03/experimental/__config> +#include <__cxx03/limits> _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/__simd/vec_ext.h b/libcxx/include/__cxx03/experimental/__simd/vec_ext.h index 316866b84873d..99917e51a2217 100644 --- a/libcxx/include/__cxx03/experimental/__simd/vec_ext.h +++ b/libcxx/include/__cxx03/experimental/__simd/vec_ext.h @@ -10,15 +10,15 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_VEC_EXT_H #define _LIBCPP_EXPERIMENTAL___SIMD_VEC_EXT_H -#include <__assert> -#include <__bit/bit_ceil.h> -#include <__utility/forward.h> -#include <__utility/integer_sequence.h> -#include -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__bit/bit_ceil.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/traits.h> +#include <__cxx03/experimental/__simd/utility.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/__cxx03/experimental/iterator b/libcxx/include/__cxx03/experimental/iterator index de82da2d3d72b..d579927c548cf 100644 --- a/libcxx/include/__cxx03/experimental/iterator +++ b/libcxx/include/__cxx03/experimental/iterator @@ -52,19 +52,19 @@ namespace std { */ -#include <__memory/addressof.h> -#include <__type_traits/decay.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include -#include +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__type_traits/decay.h> +#include 
<__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/experimental/__config> +#include <__cxx03/iterator> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 14 @@ -120,8 +120,8 @@ _LIBCPP_END_NAMESPACE_LFTS _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/iosfwd> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_EXPERIMENTAL_ITERATOR diff --git a/libcxx/include/__cxx03/experimental/memory b/libcxx/include/__cxx03/experimental/memory index e9663d43a8ab7..6e70a6d2f87c2 100644 --- a/libcxx/include/__cxx03/experimental/memory +++ b/libcxx/include/__cxx03/experimental/memory @@ -49,15 +49,15 @@ public: } */ -#include <__functional/hash.h> -#include <__functional/operations.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/common_type.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_convertible.h> -#include -#include +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -192,7 +192,7 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_ENABLE_EXPERIMENTAL #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/limits> #endif #endif /* _LIBCPP_EXPERIMENTAL_MEMORY */ diff --git a/libcxx/include/__cxx03/experimental/propagate_const 
b/libcxx/include/__cxx03/experimental/propagate_const index d7a695d838892..d439f7c52c0a6 100644 --- a/libcxx/include/__cxx03/experimental/propagate_const +++ b/libcxx/include/__cxx03/experimental/propagate_const @@ -107,35 +107,35 @@ */ -#include <__functional/operations.h> -#include <__fwd/functional.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include -#include +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> +#include <__cxx03/experimental/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if _LIBCPP_STD_VER >= 14 @@ -484,7 +484,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_EXPERIMENTAL_PROPAGATE_CONST diff --git a/libcxx/include/__cxx03/experimental/simd b/libcxx/include/__cxx03/experimental/simd index 484543b81daf1..4b0461cf4ef36 100644 --- a/libcxx/include/__cxx03/experimental/simd +++ b/libcxx/include/__cxx03/experimental/simd @@ -75,14 +75,14 @@ inline namespace parallelism_v2 { # pragma GCC system_header #endif -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include <__cxx03/experimental/__config> +#include <__cxx03/experimental/__simd/aligned_tag.h> +#include <__cxx03/experimental/__simd/declaration.h> +#include <__cxx03/experimental/__simd/reference.h> +#include <__cxx03/experimental/__simd/scalar.h> +#include <__cxx03/experimental/__simd/simd.h> +#include <__cxx03/experimental/__simd/simd_mask.h> +#include <__cxx03/experimental/__simd/traits.h> +#include <__cxx03/experimental/__simd/vec_ext.h> #endif /* _LIBCPP_EXPERIMENTAL_SIMD */ diff --git a/libcxx/include/__cxx03/experimental/type_traits b/libcxx/include/__cxx03/experimental/type_traits index 31b041bc94c43..47f29d83b0e6d 100644 --- a/libcxx/include/__cxx03/experimental/type_traits +++ b/libcxx/include/__cxx03/experimental/type_traits @@ -14,7 +14,7 @@ experimental/type_traits synopsis // C++1y -#include +#include <__cxx03/type_traits> namespace std { namespace experimental { @@ -68,12 +68,12 @@ inline namespace fundamentals_v1 { */ -#include +#include <__cxx03/experimental/__config> #if _LIBCPP_STD_VER >= 14 -# include -# include +# include <__cxx03/initializer_list> +# include <__cxx03/type_traits> # if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/experimental/utility b/libcxx/include/__cxx03/experimental/utility index 8bd0a055b7783..73239e7a4856a 100644 --- a/libcxx/include/__cxx03/experimental/utility +++ b/libcxx/include/__cxx03/experimental/utility @@ -15,7 +15,7 @@ // C++1y -#include +#include <__cxx03/utility> namespace std { namespace experimental { @@ -30,8 +30,8 @@ inline namespace fundamentals_v1 { */ -#include -#include +#include <__cxx03/experimental/__config> +#include <__cxx03/utility> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/ext/__hash b/libcxx/include/__cxx03/ext/__hash index 67f7e351756fb..3b363257b1a0b 100644 --- a/libcxx/include/__cxx03/ext/__hash +++ b/libcxx/include/__cxx03/ext/__hash @@ -12,10 +12,10 @@ #pragma GCC system_header -#include <__config> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstring> +#include <__cxx03/stddef.h> +#include <__cxx03/string> namespace __gnu_cxx { diff --git a/libcxx/include/__cxx03/ext/hash_map b/libcxx/include/__cxx03/ext/hash_map index 7b5b31c408178..bffb68672e438 100644 --- a/libcxx/include/__cxx03/ext/hash_map +++ b/libcxx/include/__cxx03/ext/hash_map @@ -201,11 +201,11 @@ template */ -#include <__config> -#include <__hash_table> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__hash_table> +#include <__cxx03/algorithm> +#include <__cxx03/ext/__hash> +#include <__cxx03/functional> #if defined(__DEPRECATED) && __DEPRECATED # if defined(_LIBCPP_WARNING) @@ -864,9 +864,9 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const hash_multimap<_Key, _Tp, _Has } // namespace __gnu_cxx #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_HASH_MAP diff 
--git a/libcxx/include/__cxx03/ext/hash_set b/libcxx/include/__cxx03/ext/hash_set index 1ab259b59979f..3886e578ec4aa 100644 --- a/libcxx/include/__cxx03/ext/hash_set +++ b/libcxx/include/__cxx03/ext/hash_set @@ -192,11 +192,11 @@ template */ -#include <__config> -#include <__hash_table> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__hash_table> +#include <__cxx03/algorithm> +#include <__cxx03/ext/__hash> +#include <__cxx03/functional> #if defined(__DEPRECATED) && __DEPRECATED # if defined(_LIBCPP_WARNING) @@ -576,9 +576,9 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const hash_multiset<_Value, _Hash, } // namespace __gnu_cxx #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_HASH_SET diff --git a/libcxx/include/__cxx03/fenv.h b/libcxx/include/__cxx03/fenv.h index 5647f2b339555..dd9f381892b54 100644 --- a/libcxx/include/__cxx03/fenv.h +++ b/libcxx/include/__cxx03/fenv.h @@ -49,7 +49,7 @@ int feupdateenv(const fenv_t* envp); */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/filesystem b/libcxx/include/__cxx03/filesystem index 6ea04df0a089b..2f1393bcbcf61 100644 --- a/libcxx/include/__cxx03/filesystem +++ b/libcxx/include/__cxx03/filesystem @@ -533,45 +533,45 @@ inline constexpr bool std::ranges::enable_view +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 17 -# include <__filesystem/copy_options.h> -# include <__filesystem/directory_entry.h> -# include <__filesystem/directory_iterator.h> -# include <__filesystem/directory_options.h> -# include <__filesystem/file_status.h> -# include <__filesystem/file_time_type.h> -# include <__filesystem/file_type.h> -# include <__filesystem/filesystem_error.h> -# include <__filesystem/operations.h> -# include 
<__filesystem/path.h> -# include <__filesystem/path_iterator.h> -# include <__filesystem/perm_options.h> -# include <__filesystem/perms.h> -# include <__filesystem/recursive_directory_iterator.h> -# include <__filesystem/space_info.h> -# include <__filesystem/u8path.h> +# include <__cxx03/__filesystem/copy_options.h> +# include <__cxx03/__filesystem/directory_entry.h> +# include <__cxx03/__filesystem/directory_iterator.h> +# include <__cxx03/__filesystem/directory_options.h> +# include <__cxx03/__filesystem/file_status.h> +# include <__cxx03/__filesystem/file_time_type.h> +# include <__cxx03/__filesystem/file_type.h> +# include <__cxx03/__filesystem/filesystem_error.h> +# include <__cxx03/__filesystem/operations.h> +# include <__cxx03/__filesystem/path.h> +# include <__cxx03/__filesystem/path_iterator.h> +# include <__cxx03/__filesystem/perm_options.h> +# include <__cxx03/__filesystem/perms.h> +# include <__cxx03/__filesystem/recursive_directory_iterator.h> +# include <__cxx03/__filesystem/space_info.h> +# include <__cxx03/__filesystem/u8path.h> #endif -#include +#include <__cxx03/version> // standard-mandated includes // [fs.filesystem.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/iosfwd> +# include <__cxx03/new> +# include <__cxx03/system_error> #endif #endif // _LIBCPP_FILESYSTEM diff --git a/libcxx/include/__cxx03/float.h b/libcxx/include/__cxx03/float.h index d572866c1358b..535e73e461519 100644 --- a/libcxx/include/__cxx03/float.h +++ b/libcxx/include/__cxx03/float.h @@ -70,7 +70,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__cxx03/format b/libcxx/include/__cxx03/format index a88b3ef8528e2..c8adb186f1024 100644 --- a/libcxx/include/__cxx03/format +++ b/libcxx/include/__cxx03/format @@ -188,69 +188,69 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 -# include <__format/buffer.h> -# include <__format/concepts.h> -# include <__format/container_adaptor.h> -# include <__format/enable_insertable.h> -# include <__format/escaped_output_table.h> -# include <__format/extended_grapheme_cluster_table.h> -# include <__format/format_arg.h> -# include <__format/format_arg_store.h> -# include <__format/format_args.h> -# include <__format/format_context.h> -# include <__format/format_error.h> -# include <__format/format_functions.h> -# include <__format/format_parse_context.h> -# include <__format/format_string.h> -# include <__format/format_to_n_result.h> -# include <__format/formatter.h> -# include <__format/formatter_bool.h> -# include <__format/formatter_char.h> -# include <__format/formatter_floating_point.h> -# include <__format/formatter_integer.h> -# include <__format/formatter_pointer.h> -# include <__format/formatter_string.h> -# include <__format/formatter_tuple.h> -# include <__format/parser_std_format_spec.h> -# include <__format/range_default_formatter.h> -# include <__format/range_formatter.h> -# include <__format/unicode.h> -# include <__fwd/format.h> +# include <__cxx03/__format/buffer.h> +# include <__cxx03/__format/concepts.h> +# include <__cxx03/__format/container_adaptor.h> +# include <__cxx03/__format/enable_insertable.h> +# include <__cxx03/__format/escaped_output_table.h> +# include <__cxx03/__format/extended_grapheme_cluster_table.h> +# include <__cxx03/__format/format_arg.h> +# include <__cxx03/__format/format_arg_store.h> +# include <__cxx03/__format/format_args.h> +# include <__cxx03/__format/format_context.h> +# include <__cxx03/__format/format_error.h> +# include <__cxx03/__format/format_functions.h> +# 
include <__cxx03/__format/format_parse_context.h> +# include <__cxx03/__format/format_string.h> +# include <__cxx03/__format/format_to_n_result.h> +# include <__cxx03/__format/formatter.h> +# include <__cxx03/__format/formatter_bool.h> +# include <__cxx03/__format/formatter_char.h> +# include <__cxx03/__format/formatter_floating_point.h> +# include <__cxx03/__format/formatter_integer.h> +# include <__cxx03/__format/formatter_pointer.h> +# include <__cxx03/__format/formatter_string.h> +# include <__cxx03/__format/formatter_tuple.h> +# include <__cxx03/__format/parser_std_format_spec.h> +# include <__cxx03/__format/range_default_formatter.h> +# include <__cxx03/__format/range_formatter.h> +# include <__cxx03/__format/unicode.h> +# include <__cxx03/__fwd/format.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/array> +# include <__cxx03/cctype> +# include <__cxx03/cerrno> +# include <__cxx03/clocale> +# include <__cxx03/cmath> +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/initializer_list> +# include <__cxx03/limits> +# include <__cxx03/locale> +# include <__cxx03/new> +# include <__cxx03/optional> +# include <__cxx03/queue> +# include <__cxx03/stack> +# include <__cxx03/stdexcept> +# include <__cxx03/string> +# include <__cxx03/string_view> +# include <__cxx03/tuple> # if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include +# include <__cxx03/cwchar> # endif #endif diff --git a/libcxx/include/__cxx03/forward_list b/libcxx/include/__cxx03/forward_list index 7330a6619be73..c83ab2582372d 
100644 --- a/libcxx/include/__cxx03/forward_list +++ b/libcxx/include/__cxx03/forward_list @@ -195,62 +195,62 @@ template */ -#include <__algorithm/comp.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__algorithm/min.h> -#include <__config> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_iterator.h> -#include <__iterator/next.h> -#include <__memory/addressof.h> -#include <__memory/allocation_guard.h> -#include <__memory/allocator.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_allocator.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include -#include // __launder -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocation_guard.h> +#include <__cxx03/__memory/allocator.h> +#include 
<__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/limits> +#include <__cxx03/new> // __launder +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [forward.list.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1557,18 +1557,18 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# 
include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/functional> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_FORWARD_LIST diff --git a/libcxx/include/__cxx03/fstream b/libcxx/include/__cxx03/fstream index ab5ebf8e2c3d3..6fbced3caec3e 100644 --- a/libcxx/include/__cxx03/fstream +++ b/libcxx/include/__cxx03/fstream @@ -186,29 +186,29 @@ typedef basic_fstream wfstream; */ -#include <__algorithm/max.h> -#include <__assert> -#include <__config> -#include <__fwd/fstream.h> -#include <__locale> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_same.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__utility/unreachable.h> -#include -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/fstream.h> +#include <__cxx03/__locale> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/cstdio> +#include <__cxx03/filesystem> +#include <__cxx03/istream> +#include <__cxx03/ostream> +#include <__cxx03/typeinfo> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if defined(_LIBCPP_MSVCRT) || defined(_NEWLIB_VERSION) # define _LIBCPP_HAS_NO_OFF_T_FUNCTIONS @@ -1558,15 +1558,15 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && 
_LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_FSTREAM diff --git a/libcxx/include/__cxx03/functional b/libcxx/include/__cxx03/functional index 3d39f654ddb08..406c2ce297259 100644 --- a/libcxx/include/__cxx03/functional +++ b/libcxx/include/__cxx03/functional @@ -527,72 +527,72 @@ POLICY: For non-variadic implementations, the number of arguments is limited */ -#include <__config> - -#include <__functional/binary_function.h> -#include <__functional/binary_negate.h> -#include <__functional/bind.h> -#include <__functional/binder1st.h> -#include <__functional/binder2nd.h> -#include <__functional/hash.h> -#include <__functional/mem_fn.h> // TODO: deprecate -#include <__functional/mem_fun_ref.h> -#include <__functional/operations.h> -#include <__functional/pointer_to_binary_function.h> -#include <__functional/pointer_to_unary_function.h> -#include <__functional/reference_wrapper.h> -#include <__functional/unary_function.h> -#include <__functional/unary_negate.h> +#include <__cxx03/__config> + +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/binary_negate.h> +#include <__cxx03/__functional/bind.h> +#include <__cxx03/__functional/binder1st.h> +#include <__cxx03/__functional/binder2nd.h> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/mem_fn.h> // TODO: deprecate +#include <__cxx03/__functional/mem_fun_ref.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__functional/pointer_to_binary_function.h> +#include <__cxx03/__functional/pointer_to_unary_function.h> +#include <__cxx03/__functional/reference_wrapper.h> +#include 
<__cxx03/__functional/unary_function.h> +#include <__cxx03/__functional/unary_negate.h> #ifndef _LIBCPP_CXX03_LANG -# include <__functional/function.h> +# include <__cxx03/__functional/function.h> #endif #if _LIBCPP_STD_VER >= 17 -# include <__functional/boyer_moore_searcher.h> -# include <__functional/default_searcher.h> -# include <__functional/invoke.h> -# include <__functional/not_fn.h> +# include <__cxx03/__functional/boyer_moore_searcher.h> +# include <__cxx03/__functional/default_searcher.h> +# include <__cxx03/__functional/invoke.h> +# include <__cxx03/__functional/not_fn.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__functional/bind_back.h> -# include <__functional/bind_front.h> -# include <__functional/identity.h> -# include <__functional/ranges_operations.h> -# include <__type_traits/unwrap_ref.h> +# include <__cxx03/__functional/bind_back.h> +# include <__cxx03/__functional/bind_front.h> +# include <__cxx03/__functional/identity.h> +# include <__cxx03/__functional/ranges_operations.h> +# include <__cxx03/__type_traits/unwrap_ref.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && defined(_LIBCPP_CXX03_LANG) -# include -# include +# include <__cxx03/limits> +# include <__cxx03/new> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include -# include -# include +# include <__cxx03/array> +# include <__cxx03/initializer_list> +# include <__cxx03/unordered_map> +# include <__cxx03/vector> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/exception> +# include <__cxx03/iosfwd> +# include <__cxx03/memory> +# include 
<__cxx03/stdexcept> +# include <__cxx03/tuple> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_FUNCTIONAL diff --git a/libcxx/include/__cxx03/future b/libcxx/include/__cxx03/future index 0be32620139e3..d40749d7375ea 100644 --- a/libcxx/include/__cxx03/future +++ b/libcxx/include/__cxx03/future @@ -362,44 +362,44 @@ template struct uses_allocator, Alloc>; */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__assert> -# include <__chrono/duration.h> -# include <__chrono/time_point.h> -# include <__exception/exception_ptr.h> -# include <__memory/addressof.h> -# include <__memory/allocator.h> -# include <__memory/allocator_arg_t.h> -# include <__memory/allocator_destructor.h> -# include <__memory/allocator_traits.h> -# include <__memory/compressed_pair.h> -# include <__memory/pointer_traits.h> -# include <__memory/shared_ptr.h> -# include <__memory/unique_ptr.h> -# include <__memory/uses_allocator.h> -# include <__system_error/error_category.h> -# include <__system_error/error_code.h> -# include <__system_error/error_condition.h> -# include <__type_traits/aligned_storage.h> -# include <__type_traits/strip_signature.h> -# include <__utility/auto_cast.h> -# include <__utility/forward.h> -# include <__utility/move.h> -# include -# include -# include -# include -# include +# include <__cxx03/__assert> +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__exception/exception_ptr.h> +# include <__cxx03/__memory/addressof.h> +# include <__cxx03/__memory/allocator.h> +# include <__cxx03/__memory/allocator_arg_t.h> +# include <__cxx03/__memory/allocator_destructor.h> +# include <__cxx03/__memory/allocator_traits.h> +# include <__cxx03/__memory/compressed_pair.h> +# include <__cxx03/__memory/pointer_traits.h> +# include <__cxx03/__memory/shared_ptr.h> +# include <__cxx03/__memory/unique_ptr.h> +# include 
<__cxx03/__memory/uses_allocator.h> +# include <__cxx03/__system_error/error_category.h> +# include <__cxx03/__system_error/error_code.h> +# include <__cxx03/__system_error/error_condition.h> +# include <__cxx03/__type_traits/aligned_storage.h> +# include <__cxx03/__type_traits/strip_signature.h> +# include <__cxx03/__utility/auto_cast.h> +# include <__cxx03/__utility/forward.h> +# include <__cxx03/__utility/move.h> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/thread> +# include <__cxx03/version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -2048,15 +2048,15 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/chrono> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/cstdlib> +# include <__cxx03/exception> +# include <__cxx03/iosfwd> +# include <__cxx03/system_error> #endif #endif // _LIBCPP_FUTURE diff --git a/libcxx/include/__cxx03/initializer_list b/libcxx/include/__cxx03/initializer_list index 680ca1cd20d55..b891c5e51b7a1 100644 --- a/libcxx/include/__cxx03/initializer_list +++ b/libcxx/include/__cxx03/initializer_list @@ -42,8 +42,8 @@ template const E* end(initializer_list il) noexcept; // constexpr in */ -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/cstddef> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/inttypes.h b/libcxx/include/__cxx03/inttypes.h index 8664412bd52ff..9101fab9ab080 100644 --- a/libcxx/include/__cxx03/inttypes.h +++ b/libcxx/include/__cxx03/inttypes.h @@ -20,7 +20,7 @@ This entire 
header is C99 / C++0X -#include // includes +#include <__cxx03/stdint.h> // includes Macros: @@ -235,7 +235,7 @@ uintmax_t wcstoumax(const wchar_t* restrict nptr, wchar_t** restrict endptr, int */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -254,7 +254,7 @@ uintmax_t wcstoumax(const wchar_t* restrict nptr, wchar_t** restrict endptr, int #ifdef __cplusplus -# include +# include <__cxx03/stdint.h> # undef imaxabs # undef imaxdiv diff --git a/libcxx/include/__cxx03/iomanip b/libcxx/include/__cxx03/iomanip index fb4f15b9a5853..768bee9f22664 100644 --- a/libcxx/include/__cxx03/iomanip +++ b/libcxx/include/__cxx03/iomanip @@ -42,9 +42,9 @@ template */ -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/istream> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/ios b/libcxx/include/__cxx03/ios index fae71caa87ba3..a9eeefc0883d4 100644 --- a/libcxx/include/__cxx03/ios +++ b/libcxx/include/__cxx03/ios @@ -13,7 +13,7 @@ /* ios synopsis -#include +#include <__cxx03/iosfwd> namespace std { @@ -211,28 +211,28 @@ storage-class-specifier const error_category& iostream_category() noexcept; */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__fwd/ios.h> -# include <__ios/fpos.h> -# include <__locale> -# include <__system_error/error_category.h> -# include <__system_error/error_code.h> -# include <__system_error/error_condition.h> -# include <__system_error/system_error.h> -# include <__utility/swap.h> -# include <__verbose_abort> -# include +# include <__cxx03/__fwd/ios.h> +# include <__cxx03/__ios/fpos.h> +# include <__cxx03/__locale> +# include <__cxx03/__system_error/error_category.h> +# include <__cxx03/__system_error/error_code.h> +# include <__cxx03/__system_error/error_condition.h> +# include 
<__cxx03/__system_error/system_error.h> +# include <__cxx03/__utility/swap.h> +# include <__cxx03/__verbose_abort> +# include <__cxx03/version> // standard-mandated includes // [ios.syn] -# include +# include <__cxx03/iosfwd> # if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include <__atomic/atomic.h> // for __xindex_ +# include <__cxx03/__atomic/atomic.h> // for __xindex_ # endif # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -240,7 +240,7 @@ storage-class-specifier const error_category& iostream_category() noexcept; # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -871,19 +871,19 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_LOCALIZATION) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstddef> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/initializer_list> +# include <__cxx03/limits> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/system_error> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_IOS diff --git a/libcxx/include/__cxx03/iosfwd b/libcxx/include/__cxx03/iosfwd index eeafcc37c598e..9c4c86bc2fa27 100644 --- a/libcxx/include/__cxx03/iosfwd +++ b/libcxx/include/__cxx03/iosfwd @@ -105,17 +105,17 @@ using wosyncstream = basic_osyncstream; // C++20 */ -#include <__config> -#include <__fwd/fstream.h> -#include <__fwd/ios.h> -#include <__fwd/istream.h> -#include <__fwd/memory.h> -#include <__fwd/ostream.h> -#include <__fwd/sstream.h> -#include <__fwd/streambuf.h> -#include <__fwd/string.h> -#include <__std_mbstate_t.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/fstream.h> +#include 
<__cxx03/__fwd/ios.h> +#include <__cxx03/__fwd/istream.h> +#include <__cxx03/__fwd/memory.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__fwd/sstream.h> +#include <__cxx03/__fwd/streambuf.h> +#include <__cxx03/__fwd/string.h> +#include <__cxx03/__std_mbstate_t.h> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/iostream b/libcxx/include/__cxx03/iostream index 5df45c6d3f78e..c4923117822c7 100644 --- a/libcxx/include/__cxx03/iostream +++ b/libcxx/include/__cxx03/iostream @@ -13,10 +13,10 @@ /* iostream synopsis -#include -#include -#include -#include +#include <__cxx03/ios> +#include <__cxx03/istream> +#include <__cxx03/ostream> +#include <__cxx03/streambuf> namespace std { @@ -33,16 +33,16 @@ extern wostream wclog; */ -#include <__config> -#include +#include <__cxx03/__config> +#include <__cxx03/version> // standard-mandated includes // [iostream.syn] -#include -#include -#include -#include +#include <__cxx03/ios> +#include <__cxx03/istream> +#include <__cxx03/ostream> +#include <__cxx03/streambuf> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/istream b/libcxx/include/__cxx03/istream index d2b577a9ad9ef..c2fc00298bd58 100644 --- a/libcxx/include/__cxx03/istream +++ b/libcxx/include/__cxx03/istream @@ -158,26 +158,26 @@ template */ -#include <__config> -#include <__fwd/istream.h> -#include <__iterator/istreambuf_iterator.h> -#include <__ostream/basic_ostream.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/is_base_of.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/istream.h> +#include <__cxx03/__iterator/istreambuf_iterator.h> +#include <__cxx03/__ostream/basic_ostream.h> +#include <__cxx03/__type_traits/conjunction.h> +#include 
<__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/bitset> +#include <__cxx03/ios> +#include <__cxx03/locale> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1362,10 +1362,10 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_iostream; _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/iosfwd> +# include <__cxx03/ostream> +# include <__cxx03/type_traits> #endif _LIBCPP_POP_MACROS diff --git a/libcxx/include/__cxx03/iterator b/libcxx/include/__cxx03/iterator index fca75f0a19ed1..f415443f8dc1d 100644 --- a/libcxx/include/__cxx03/iterator +++ b/libcxx/include/__cxx03/iterator @@ -13,7 +13,7 @@ /* iterator synopsis -#include +#include <__cxx03/concepts> namespace std { @@ -679,76 +679,76 @@ template constexpr const E* data(initializer_list il) noexcept; */ -#include <__config> -#include <__iterator/access.h> -#include <__iterator/advance.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/distance.h> -#include <__iterator/front_insert_iterator.h> -#include <__iterator/insert_iterator.h> -#include <__iterator/istream_iterator.h> -#include <__iterator/istreambuf_iterator.h> -#include <__iterator/iterator.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_iterator.h> -#include <__iterator/next.h> -#include <__iterator/ostream_iterator.h> -#include <__iterator/ostreambuf_iterator.h> -#include <__iterator/prev.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/wrap_iter.h> +#include <__cxx03/__config> +#include <__cxx03/__iterator/access.h> +#include 
<__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/front_insert_iterator.h> +#include <__cxx03/__iterator/insert_iterator.h> +#include <__cxx03/__iterator/istream_iterator.h> +#include <__cxx03/__iterator/istreambuf_iterator.h> +#include <__cxx03/__iterator/iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/ostream_iterator.h> +#include <__cxx03/__iterator/ostreambuf_iterator.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> #if _LIBCPP_STD_VER >= 14 -# include <__iterator/reverse_access.h> +# include <__cxx03/__iterator/reverse_access.h> #endif #if _LIBCPP_STD_VER >= 17 -# include <__iterator/data.h> -# include <__iterator/empty.h> -# include <__iterator/size.h> +# include <__cxx03/__iterator/data.h> +# include <__cxx03/__iterator/empty.h> +# include <__cxx03/__iterator/size.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__iterator/common_iterator.h> -# include <__iterator/concepts.h> -# include <__iterator/counted_iterator.h> -# include <__iterator/default_sentinel.h> -# include <__iterator/incrementable_traits.h> -# include <__iterator/indirectly_comparable.h> -# include <__iterator/iter_move.h> -# include <__iterator/iter_swap.h> -# include <__iterator/mergeable.h> -# include <__iterator/move_sentinel.h> -# include <__iterator/permutable.h> -# include <__iterator/projected.h> -# include <__iterator/readable_traits.h> -# include <__iterator/sortable.h> -# include <__iterator/unreachable_sentinel.h> +# include <__cxx03/__iterator/common_iterator.h> +# include <__cxx03/__iterator/concepts.h> +# include <__cxx03/__iterator/counted_iterator.h> +# include <__cxx03/__iterator/default_sentinel.h> +# include <__cxx03/__iterator/incrementable_traits.h> +# 
include <__cxx03/__iterator/indirectly_comparable.h> +# include <__cxx03/__iterator/iter_move.h> +# include <__cxx03/__iterator/iter_swap.h> +# include <__cxx03/__iterator/mergeable.h> +# include <__cxx03/__iterator/move_sentinel.h> +# include <__cxx03/__iterator/permutable.h> +# include <__cxx03/__iterator/projected.h> +# include <__cxx03/__iterator/readable_traits.h> +# include <__cxx03/__iterator/sortable.h> +# include <__cxx03/__iterator/unreachable_sentinel.h> #endif -#include +#include <__cxx03/version> // standard-mandated includes // [iterator.synopsis] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/concepts> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/variant> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/exception> +# include <__cxx03/new> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_ITERATOR diff --git a/libcxx/include/__cxx03/latch b/libcxx/include/__cxx03/latch index 81d6028a9c2ce..bcfb6a51243db 100644 --- a/libcxx/include/__cxx03/latch +++ b/libcxx/include/__cxx03/latch @@ -40,24 +40,24 @@ namespace std */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__assert> -# include <__atomic/atomic_base.h> -# include <__atomic/atomic_sync.h> -# include <__atomic/memory_order.h> -# include -# include -# include +# include <__cxx03/__assert> +# include <__cxx03/__atomic/atomic_base.h> +# include <__cxx03/__atomic/atomic_sync.h> +# include <__cxx03/__atomic/memory_order.h> +# include <__cxx03/cstddef> +# include <__cxx03/limits> +# include <__cxx03/version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC 
system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> # if _LIBCPP_STD_VER >= 14 @@ -123,7 +123,7 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/atomic> #endif #endif //_LIBCPP_LATCH diff --git a/libcxx/include/__cxx03/limits b/libcxx/include/__cxx03/limits index d55c7cd75f34f..d028d8256f59d 100644 --- a/libcxx/include/__cxx03/limits +++ b/libcxx/include/__cxx03/limits @@ -102,18 +102,18 @@ template<> class numeric_limits; */ -#include <__config> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/remove_cv.h> +#include <__cxx03/__config> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/remove_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> -#include +#include <__cxx03/__undef_macros> +#include <__cxx03/version> _LIBCPP_BEGIN_NAMESPACE_STD @@ -578,7 +578,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_LIMITS diff --git a/libcxx/include/__cxx03/list b/libcxx/include/__cxx03/list index 76b1d9241b41c..742df50767e15 100644 --- a/libcxx/include/__cxx03/list +++ b/libcxx/include/__cxx03/list @@ -197,67 +197,67 @@ template */ -#include <__algorithm/comp.h> -#include <__algorithm/equal.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__algorithm/min.h> -#include <__assert> -#include <__config> -#include <__format/enable_insertable.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/move_iterator.h> -#include 
<__iterator/next.h> -#include <__iterator/prev.h> -#include <__iterator/reverse_iterator.h> -#include <__memory/addressof.h> -#include <__memory/allocation_guard.h> -#include <__memory/allocator.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_allocator.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_same.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include -#include -#include // __launder -#include +#include <__cxx03/__algorithm/comp.h> +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__format/enable_insertable.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/move_iterator.h> +#include <__cxx03/__iterator/next.h> +#include <__cxx03/__iterator/prev.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocation_guard.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include 
<__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstring> +#include <__cxx03/limits> +#include <__cxx03/new> // __launder +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [list.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1726,17 +1726,17 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include 
<__cxx03/functional> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_LIST diff --git a/libcxx/include/__cxx03/locale b/libcxx/include/__cxx03/locale index 573910a85bef5..48ae270e0949b 100644 --- a/libcxx/include/__cxx03/locale +++ b/libcxx/include/__cxx03/locale @@ -187,33 +187,33 @@ template class messages_byname; */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__algorithm/copy.h> -# include <__algorithm/equal.h> -# include <__algorithm/find.h> -# include <__algorithm/max.h> -# include <__algorithm/reverse.h> -# include <__algorithm/unwrap_iter.h> -# include <__assert> -# include <__iterator/access.h> -# include <__iterator/back_insert_iterator.h> -# include <__iterator/istreambuf_iterator.h> -# include <__iterator/ostreambuf_iterator.h> -# include <__locale> -# include <__memory/unique_ptr.h> -# include <__type_traits/make_unsigned.h> -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/__algorithm/copy.h> +# include <__cxx03/__algorithm/equal.h> +# include <__cxx03/__algorithm/find.h> +# include <__cxx03/__algorithm/max.h> +# include <__cxx03/__algorithm/reverse.h> +# include <__cxx03/__algorithm/unwrap_iter.h> +# include <__cxx03/__assert> +# include <__cxx03/__iterator/access.h> +# include <__cxx03/__iterator/back_insert_iterator.h> +# include <__cxx03/__iterator/istreambuf_iterator.h> +# include <__cxx03/__iterator/ostreambuf_iterator.h> +# include <__cxx03/__locale> +# include <__cxx03/__memory/unique_ptr.h> +# include <__cxx03/__type_traits/make_unsigned.h> +# include <__cxx03/cerrno> +# include <__cxx03/cstdio> +# include <__cxx03/cstdlib> +# include <__cxx03/ctime> +# include <__cxx03/ios> +# include <__cxx03/limits> +# include <__cxx03/new> +# include <__cxx03/streambuf> +# include <__cxx03/version> // 
TODO: Fix __bsd_locale_defaults.h // NOLINTBEGIN(libcpp-robust-against-adl) @@ -222,18 +222,18 @@ template class messages_byname; // Most unix variants have catopen. These are the specific ones that don't. # if !defined(__BIONIC__) && !defined(_NEWLIB_VERSION) && !defined(__EMSCRIPTEN__) # define _LIBCPP_HAS_CATOPEN 1 -# include +# include <__cxx03/nl_types.h> # endif # endif # ifdef _LIBCPP_LOCALE__L_EXTENSIONS -# include <__locale_dir/locale_base_api/bsd_locale_defaults.h> +# include <__cxx03/__locale_dir/locale_base_api/bsd_locale_defaults.h> # else -# include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> +# include <__cxx03/__locale_dir/locale_base_api/bsd_locale_fallbacks.h> # endif # if defined(__APPLE__) || defined(__FreeBSD__) -# include +# include <__cxx03/xlocale.h> # endif # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -241,7 +241,7 @@ template class messages_byname; # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -3754,14 +3754,14 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_LOCALIZATION) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdarg> +# include <__cxx03/iterator> +# include <__cxx03/mutex> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_LOCALE diff --git a/libcxx/include/__cxx03/locale.h b/libcxx/include/__cxx03/locale.h index 425bf47d437ac..b33059d7965d6 100644 --- a/libcxx/include/__cxx03/locale.h +++ b/libcxx/include/__cxx03/locale.h @@ -33,7 +33,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/map b/libcxx/include/__cxx03/map index 02bd17ccb4e8c..53dcbaf4eae1d 
100644 --- a/libcxx/include/__cxx03/map +++ b/libcxx/include/__cxx03/map @@ -571,53 +571,53 @@ erase_if(multimap& c, Predicate pred); // C++20 */ -#include <__algorithm/equal.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__assert> -#include <__config> -#include <__functional/binary_function.h> -#include <__functional/is_transparent.h> -#include <__functional/operations.h> -#include <__iterator/erase_if_container.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/ranges_iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__memory/addressof.h> -#include <__memory/allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__node_handle> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__tree> -#include <__type_traits/is_allocator.h> -#include <__utility/forward.h> -#include <__utility/piecewise_construct.h> -#include <__utility/swap.h> -#include -#include -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/binary_function.h> +#include <__cxx03/__functional/is_transparent.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/erase_if_container.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/ranges_iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__node_handle> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include 
<__cxx03/__tree> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/stdexcept> +#include <__cxx03/tuple> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [associative.map.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -2170,12 +2170,12 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/functional> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> +# include <__cxx03/utility> #endif #endif // _LIBCPP_MAP diff --git a/libcxx/include/__cxx03/math.h b/libcxx/include/__cxx03/math.h index 4e6304a753984..9a2509e93d01a 100644 --- a/libcxx/include/__cxx03/math.h +++ b/libcxx/include/__cxx03/math.h @@ -291,7 +291,7 @@ long double truncl(long double x); */ -# include <__config> +# include <__cxx03/__config> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -355,29 +355,29 @@ extern "C++" { # undef isunordered # endif -# include <__math/abs.h> -# include <__math/copysign.h> -# include <__math/error_functions.h> -# include 
<__math/exponential_functions.h> -# include <__math/fdim.h> -# include <__math/fma.h> -# include <__math/gamma.h> -# include <__math/hyperbolic_functions.h> -# include <__math/hypot.h> -# include <__math/inverse_hyperbolic_functions.h> -# include <__math/inverse_trigonometric_functions.h> -# include <__math/logarithms.h> -# include <__math/min_max.h> -# include <__math/modulo.h> -# include <__math/remainder.h> -# include <__math/roots.h> -# include <__math/rounding_functions.h> -# include <__math/traits.h> -# include <__math/trigonometric_functions.h> -# include <__type_traits/enable_if.h> -# include <__type_traits/is_floating_point.h> -# include <__type_traits/is_integral.h> -# include +# include <__cxx03/__math/abs.h> +# include <__cxx03/__math/copysign.h> +# include <__cxx03/__math/error_functions.h> +# include <__cxx03/__math/exponential_functions.h> +# include <__cxx03/__math/fdim.h> +# include <__cxx03/__math/fma.h> +# include <__cxx03/__math/gamma.h> +# include <__cxx03/__math/hyperbolic_functions.h> +# include <__cxx03/__math/hypot.h> +# include <__cxx03/__math/inverse_hyperbolic_functions.h> +# include <__cxx03/__math/inverse_trigonometric_functions.h> +# include <__cxx03/__math/logarithms.h> +# include <__cxx03/__math/min_max.h> +# include <__cxx03/__math/modulo.h> +# include <__cxx03/__math/remainder.h> +# include <__cxx03/__math/roots.h> +# include <__cxx03/__math/rounding_functions.h> +# include <__cxx03/__math/traits.h> +# include <__cxx03/__math/trigonometric_functions.h> +# include <__cxx03/__type_traits/enable_if.h> +# include <__cxx03/__type_traits/is_floating_point.h> +# include <__cxx03/__type_traits/is_integral.h> +# include <__cxx03/stdlib.h> // fpclassify relies on implementation-defined constants, so we can't move it to a detail header _LIBCPP_BEGIN_NAMESPACE_STD @@ -509,7 +509,7 @@ using std::__math::trunc; // extension which allows users to do: // // #define _USE_MATH_DEFINES -// #include +// #include <__cxx03/math.h> // // and receive the 
definitions of mathematical constants, even if // has previously been included. diff --git a/libcxx/include/__cxx03/mdspan b/libcxx/include/__cxx03/mdspan index 29190e4a9953e..950145b34b098 100644 --- a/libcxx/include/__cxx03/mdspan +++ b/libcxx/include/__cxx03/mdspan @@ -408,31 +408,31 @@ namespace std { #ifndef _LIBCPP_MDSPAN #define _LIBCPP_MDSPAN -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 23 -# include <__fwd/mdspan.h> -# include <__mdspan/default_accessor.h> -# include <__mdspan/extents.h> -# include <__mdspan/layout_left.h> -# include <__mdspan/layout_right.h> -# include <__mdspan/layout_stride.h> -# include <__mdspan/mdspan.h> +# include <__cxx03/__fwd/mdspan.h> +# include <__cxx03/__mdspan/default_accessor.h> +# include <__cxx03/__mdspan/extents.h> +# include <__cxx03/__mdspan/layout_left.h> +# include <__cxx03/__mdspan/layout_right.h> +# include <__cxx03/__mdspan/layout_stride.h> +# include <__cxx03/__mdspan/mdspan.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/array> +# include <__cxx03/cinttypes> +# include <__cxx03/concepts> +# include <__cxx03/cstddef> +# include <__cxx03/limits> +# include <__cxx03/span> #endif #endif // _LIBCPP_MDSPAN diff --git a/libcxx/include/__cxx03/memory b/libcxx/include/__cxx03/memory index b940a32c3ebe6..5bbcd8513e73d 100644 --- a/libcxx/include/__cxx03/memory +++ b/libcxx/include/__cxx03/memory @@ -934,65 +934,65 @@ template // clang-format on -#include <__config> -#include <__memory/addressof.h> -#include <__memory/align.h> -#include <__memory/allocator.h> -#include <__memory/allocator_arg_t.h> -#include <__memory/allocator_traits.h> -#include <__memory/auto_ptr.h> -#include <__memory/inout_ptr.h> -#include <__memory/out_ptr.h> -#include 
<__memory/pointer_traits.h> -#include <__memory/raw_storage_iterator.h> -#include <__memory/shared_ptr.h> -#include <__memory/temporary_buffer.h> -#include <__memory/uninitialized_algorithms.h> -#include <__memory/unique_ptr.h> -#include <__memory/uses_allocator.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/align.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_arg_t.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/auto_ptr.h> +#include <__cxx03/__memory/inout_ptr.h> +#include <__cxx03/__memory/out_ptr.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/raw_storage_iterator.h> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__memory/temporary_buffer.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__memory/unique_ptr.h> +#include <__cxx03/__memory/uses_allocator.h> // standard-mandated includes #if _LIBCPP_STD_VER >= 17 -# include <__memory/construct_at.h> +# include <__cxx03/__memory/construct_at.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__memory/assume_aligned.h> -# include <__memory/concepts.h> -# include <__memory/ranges_construct_at.h> -# include <__memory/ranges_uninitialized_algorithms.h> -# include <__memory/uses_allocator_construction.h> +# include <__cxx03/__memory/assume_aligned.h> +# include <__cxx03/__memory/concepts.h> +# include <__cxx03/__memory/ranges_construct_at.h> +# include <__cxx03/__memory/ranges_uninitialized_algorithms.h> +# include <__cxx03/__memory/uses_allocator_construction.h> #endif #if _LIBCPP_STD_VER >= 23 -# include <__memory/allocate_at_least.h> +# include <__cxx03/__memory/allocate_at_least.h> #endif -#include +#include <__cxx03/version> // [memory.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 
20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/tuple> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_MEMORY diff --git a/libcxx/include/__cxx03/memory_resource b/libcxx/include/__cxx03/memory_resource index 67411054820a1..fb217e40f6301 100644 --- a/libcxx/include/__cxx03/memory_resource +++ b/libcxx/include/__cxx03/memory_resource @@ -49,35 +49,35 @@ namespace std::pmr { */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 17 -# include <__memory_resource/memory_resource.h> -# include <__memory_resource/monotonic_buffer_resource.h> -# include <__memory_resource/polymorphic_allocator.h> -# include <__memory_resource/pool_options.h> -# include <__memory_resource/synchronized_pool_resource.h> -# include <__memory_resource/unsynchronized_pool_resource.h> +# include <__cxx03/__memory_resource/memory_resource.h> +# include <__cxx03/__memory_resource/monotonic_buffer_resource.h> +# include <__cxx03/__memory_resource/polymorphic_allocator.h> +# include <__cxx03/__memory_resource/pool_options.h> +# include <__cxx03/__memory_resource/synchronized_pool_resource.h> +# include <__cxx03/__memory_resource/unsynchronized_pool_resource.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include 
<__cxx03/limits> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/tuple> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/stdexcept> #endif #endif /* _LIBCPP_MEMORY_RESOURCE */ diff --git a/libcxx/include/__cxx03/mutex b/libcxx/include/__cxx03/mutex index 02c52dd72f02b..550d0f887dc49 100644 --- a/libcxx/include/__cxx03/mutex +++ b/libcxx/include/__cxx03/mutex @@ -186,32 +186,32 @@ template */ -#include <__chrono/steady_clock.h> -#include <__chrono/time_point.h> -#include <__condition_variable/condition_variable.h> -#include <__config> -#include <__memory/shared_ptr.h> -#include <__mutex/lock_guard.h> -#include <__mutex/mutex.h> -#include <__mutex/once_flag.h> -#include <__mutex/tag_types.h> -#include <__mutex/unique_lock.h> -#include <__thread/id.h> -#include <__thread/support.h> -#include <__utility/forward.h> -#include -#include +#include <__cxx03/__chrono/steady_clock.h> +#include <__cxx03/__chrono/time_point.h> +#include <__cxx03/__condition_variable/condition_variable.h> +#include <__cxx03/__config> +#include <__cxx03/__memory/shared_ptr.h> +#include <__cxx03/__mutex/lock_guard.h> +#include <__cxx03/__mutex/mutex.h> +#include <__cxx03/__mutex/once_flag.h> +#include <__cxx03/__mutex/tag_types.h> +#include <__cxx03/__mutex/unique_lock.h> +#include <__cxx03/__thread/id.h> +#include <__cxx03/__thread/support.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> +#include <__cxx03/limits> #ifndef _LIBCPP_CXX03_LANG -# include +# include <__cxx03/tuple> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -499,18 +499,18 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# 
include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/ctime> +# include <__cxx03/initializer_list> +# include <__cxx03/iosfwd> +# include <__cxx03/new> +# include <__cxx03/stdexcept> +# include <__cxx03/system_error> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> #endif #endif // _LIBCPP_MUTEX diff --git a/libcxx/include/__cxx03/new b/libcxx/include/__cxx03/new index 214dbc398530b..7458fb0838150 100644 --- a/libcxx/include/__cxx03/new +++ b/libcxx/include/__cxx03/new @@ -86,17 +86,17 @@ void operator delete[](void* ptr, void*) noexcept; */ -#include <__config> -#include <__exception/exception.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_cv.h> -#include <__verbose_abort> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/cstddef> +#include <__cxx03/version> #if defined(_LIBCPP_ABI_VCRUNTIME) -# include +# include <__cxx03/new.h> #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -355,8 +355,8 @@ inline constexpr size_t hardware_constructive_interference_size = __GCC_CONSTRUC _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_NEW diff --git a/libcxx/include/__cxx03/numbers b/libcxx/include/__cxx03/numbers index f48ba4baf38ff..d8ca33295d551 100644 --- a/libcxx/include/__cxx03/numbers +++ b/libcxx/include/__cxx03/numbers @@ -58,9 +58,9 @@ namespace std::numbers { } */ -#include 
<__concepts/arithmetic.h> -#include <__config> -#include +#include <__cxx03/__concepts/arithmetic.h> +#include <__cxx03/__config> +#include <__cxx03/version> #if _LIBCPP_STD_VER >= 20 @@ -157,8 +157,8 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_STD_VER >= 20 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_NUMBERS diff --git a/libcxx/include/__cxx03/numeric b/libcxx/include/__cxx03/numeric index 6b92ce3a07123..85c756b021618 100644 --- a/libcxx/include/__cxx03/numeric +++ b/libcxx/include/__cxx03/numeric @@ -156,52 +156,52 @@ constexpr T saturate_cast(U x) noexcept; // freestanding, Sin */ -#include <__config> +#include <__cxx03/__config> -#include <__numeric/accumulate.h> -#include <__numeric/adjacent_difference.h> -#include <__numeric/inner_product.h> -#include <__numeric/iota.h> -#include <__numeric/partial_sum.h> +#include <__cxx03/__numeric/accumulate.h> +#include <__cxx03/__numeric/adjacent_difference.h> +#include <__cxx03/__numeric/inner_product.h> +#include <__cxx03/__numeric/iota.h> +#include <__cxx03/__numeric/partial_sum.h> #if _LIBCPP_STD_VER >= 17 -# include <__numeric/exclusive_scan.h> -# include <__numeric/gcd_lcm.h> -# include <__numeric/inclusive_scan.h> -# include <__numeric/pstl.h> -# include <__numeric/reduce.h> -# include <__numeric/transform_exclusive_scan.h> -# include <__numeric/transform_inclusive_scan.h> -# include <__numeric/transform_reduce.h> +# include <__cxx03/__numeric/exclusive_scan.h> +# include <__cxx03/__numeric/gcd_lcm.h> +# include <__cxx03/__numeric/inclusive_scan.h> +# include <__cxx03/__numeric/pstl.h> +# include <__cxx03/__numeric/reduce.h> +# include <__cxx03/__numeric/transform_exclusive_scan.h> +# include <__cxx03/__numeric/transform_inclusive_scan.h> +# include <__cxx03/__numeric/transform_reduce.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__numeric/midpoint.h> -# include 
<__numeric/saturation_arithmetic.h> +# include <__cxx03/__numeric/midpoint.h> +# include <__cxx03/__numeric/saturation_arithmetic.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 14 -# include -# include +# include <__cxx03/initializer_list> +# include <__cxx03/limits> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/climits> +# include <__cxx03/cmath> +# include <__cxx03/concepts> +# include <__cxx03/cstdint> +# include <__cxx03/execution> +# include <__cxx03/functional> +# include <__cxx03/iterator> +# include <__cxx03/new> +# include <__cxx03/optional> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_NUMERIC diff --git a/libcxx/include/__cxx03/optional b/libcxx/include/__cxx03/optional index 41d7515a2b689..32be4b0cd4db8 100644 --- a/libcxx/include/__cxx03/optional +++ b/libcxx/include/__cxx03/optional @@ -177,64 +177,64 @@ namespace std { */ -#include <__assert> -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__concepts/invocable.h> -#include <__config> -#include <__exception/exception.h> -#include <__functional/hash.h> -#include <__functional/invoke.h> -#include <__functional/unary_function.h> -#include <__fwd/functional.h> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__tuple/sfinae_helpers.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/decay.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> 
-#include <__type_traits/is_destructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_scalar.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/in_place.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__concepts/invocable.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/functional.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__tuple/sfinae_helpers.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_destructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> 
+#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_scalar.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/initializer_list> +#include <__cxx03/new> +#include <__cxx03/version> // standard-mandated includes // [optional.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> namespace std // purposefully not using versioning namespace { @@ -1285,20 +1285,20 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/climits> +# include <__cxx03/concepts> +# include <__cxx03/ctime> +# include <__cxx03/iterator> +# include <__cxx03/limits> +# include <__cxx03/memory> +# include <__cxx03/ratio> +# include <__cxx03/stdexcept> +# include <__cxx03/tuple> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include 
<__cxx03/utility> +# include <__cxx03/variant> #endif #endif // _LIBCPP_OPTIONAL diff --git a/libcxx/include/__cxx03/ostream b/libcxx/include/__cxx03/ostream index 359d3c0e19c4c..708434803542d 100644 --- a/libcxx/include/__cxx03/ostream +++ b/libcxx/include/__cxx03/ostream @@ -172,31 +172,31 @@ void vprint_nonunicode(ostream& os, string_view fmt, format_args args); */ -#include <__config> +#include <__cxx03/__config> -#include <__ostream/basic_ostream.h> +#include <__cxx03/__ostream/basic_ostream.h> #if _LIBCPP_STD_VER >= 23 -# include <__ostream/print.h> +# include <__cxx03/__ostream/print.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdio> +# include <__cxx03/cstdlib> +# include <__cxx03/format> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/print> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_OSTREAM diff --git a/libcxx/include/__cxx03/print b/libcxx/include/__cxx03/print index 1a579daff270f..bb7731abe4936 100644 --- a/libcxx/include/__cxx03/print +++ b/libcxx/include/__cxx03/print @@ -33,17 +33,17 @@ namespace std { } */ -#include <__assert> -#include <__concepts/same_as.h> -#include <__config> -#include <__system_error/system_error.h> -#include <__utility/forward.h> -#include -#include -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cerrno> +#include <__cxx03/cstdio> +#include <__cxx03/format> +#include <__cxx03/string> +#include 
<__cxx03/string_view> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/queue b/libcxx/include/__cxx03/queue index 9508de9f9eff2..7031e4264f40a 100644 --- a/libcxx/include/__cxx03/queue +++ b/libcxx/include/__cxx03/queue @@ -254,38 +254,38 @@ template */ -#include <__algorithm/make_heap.h> -#include <__algorithm/pop_heap.h> -#include <__algorithm/push_heap.h> -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__functional/operations.h> -#include <__fwd/deque.h> -#include <__fwd/queue.h> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/uses_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__utility/forward.h> -#include -#include -#include +#include <__cxx03/__algorithm/make_heap.h> +#include <__cxx03/__algorithm/pop_heap.h> +#include <__cxx03/__algorithm/push_heap.h> +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__fwd/deque.h> +#include <__cxx03/__fwd/queue.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/uses_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/deque> +#include <__cxx03/vector> +#include <__cxx03/version> // standard-mandated includes // [queue.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD @@ -947,10 +947,10 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/functional> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_QUEUE diff --git a/libcxx/include/__cxx03/random b/libcxx/include/__cxx03/random index 6cc3760c20e16..6c6f730fbac7e 100644 --- a/libcxx/include/__cxx03/random +++ b/libcxx/include/__cxx03/random @@ -13,7 +13,7 @@ /* random synopsis -#include +#include <__cxx03/initializer_list> namespace std { @@ -1677,66 +1677,66 @@ class piecewise_linear_distribution } // std */ -#include <__config> -#include <__random/bernoulli_distribution.h> -#include <__random/binomial_distribution.h> -#include <__random/cauchy_distribution.h> -#include <__random/chi_squared_distribution.h> -#include <__random/default_random_engine.h> -#include <__random/discard_block_engine.h> -#include <__random/discrete_distribution.h> -#include <__random/exponential_distribution.h> -#include <__random/extreme_value_distribution.h> -#include <__random/fisher_f_distribution.h> -#include <__random/gamma_distribution.h> -#include <__random/generate_canonical.h> -#include <__random/geometric_distribution.h> -#include <__random/independent_bits_engine.h> -#include <__random/is_seed_sequence.h> -#include <__random/knuth_b.h> -#include <__random/linear_congruential_engine.h> -#include <__random/lognormal_distribution.h> -#include <__random/mersenne_twister_engine.h> -#include <__random/negative_binomial_distribution.h> -#include <__random/normal_distribution.h> -#include <__random/piecewise_constant_distribution.h> -#include <__random/piecewise_linear_distribution.h> -#include <__random/poisson_distribution.h> -#include <__random/random_device.h> -#include <__random/ranlux.h> -#include <__random/seed_seq.h> -#include <__random/shuffle_order_engine.h> 
-#include <__random/student_t_distribution.h> -#include <__random/subtract_with_carry_engine.h> -#include <__random/uniform_int_distribution.h> -#include <__random/uniform_random_bit_generator.h> -#include <__random/uniform_real_distribution.h> -#include <__random/weibull_distribution.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__random/bernoulli_distribution.h> +#include <__cxx03/__random/binomial_distribution.h> +#include <__cxx03/__random/cauchy_distribution.h> +#include <__cxx03/__random/chi_squared_distribution.h> +#include <__cxx03/__random/default_random_engine.h> +#include <__cxx03/__random/discard_block_engine.h> +#include <__cxx03/__random/discrete_distribution.h> +#include <__cxx03/__random/exponential_distribution.h> +#include <__cxx03/__random/extreme_value_distribution.h> +#include <__cxx03/__random/fisher_f_distribution.h> +#include <__cxx03/__random/gamma_distribution.h> +#include <__cxx03/__random/generate_canonical.h> +#include <__cxx03/__random/geometric_distribution.h> +#include <__cxx03/__random/independent_bits_engine.h> +#include <__cxx03/__random/is_seed_sequence.h> +#include <__cxx03/__random/knuth_b.h> +#include <__cxx03/__random/linear_congruential_engine.h> +#include <__cxx03/__random/lognormal_distribution.h> +#include <__cxx03/__random/mersenne_twister_engine.h> +#include <__cxx03/__random/negative_binomial_distribution.h> +#include <__cxx03/__random/normal_distribution.h> +#include <__cxx03/__random/piecewise_constant_distribution.h> +#include <__cxx03/__random/piecewise_linear_distribution.h> +#include <__cxx03/__random/poisson_distribution.h> +#include <__cxx03/__random/random_device.h> +#include <__cxx03/__random/ranlux.h> +#include <__cxx03/__random/seed_seq.h> +#include <__cxx03/__random/shuffle_order_engine.h> +#include <__cxx03/__random/student_t_distribution.h> +#include <__cxx03/__random/subtract_with_carry_engine.h> +#include <__cxx03/__random/uniform_int_distribution.h> +#include 
<__cxx03/__random/uniform_random_bit_generator.h> +#include <__cxx03/__random/uniform_real_distribution.h> +#include <__cxx03/__random/weibull_distribution.h> +#include <__cxx03/version> // standard-mandated includes // [rand.synopsis] -#include +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/climits> +# include <__cxx03/cmath> +# include <__cxx03/concepts> +# include <__cxx03/cstddef> +# include <__cxx03/cstdint> +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/limits> +# include <__cxx03/numeric> +# include <__cxx03/string> +# include <__cxx03/type_traits> +# include <__cxx03/vector> #endif #endif // _LIBCPP_RANDOM diff --git a/libcxx/include/__cxx03/ranges b/libcxx/include/__cxx03/ranges index fa35874265de6..3c3021e30d4a5 100644 --- a/libcxx/include/__cxx03/ranges +++ b/libcxx/include/__cxx03/ranges @@ -12,9 +12,9 @@ /* -#include // see [compare.syn] -#include // see [initializer.list.syn] -#include // see [iterator.synopsis] +#include <__cxx03/compare> // see [compare.syn] +#include <__cxx03/initializer_list> // see [initializer.list.syn] +#include <__cxx03/iterator> // see [iterator.synopsis] namespace std::ranges { inline namespace unspecified { @@ -380,84 +380,84 @@ namespace std { } */ -#include <__config> +#include <__cxx03/__config> #if _LIBCPP_STD_VER >= 20 -# include <__ranges/access.h> -# include <__ranges/all.h> -# include <__ranges/common_view.h> -# include <__ranges/concepts.h> -# include <__ranges/counted.h> -# include <__ranges/dangling.h> -# include <__ranges/data.h> -# include <__ranges/drop_view.h> -# include <__ranges/drop_while_view.h> -# include <__ranges/elements_view.h> -# 
include <__ranges/empty.h> -# include <__ranges/empty_view.h> -# include <__ranges/enable_borrowed_range.h> -# include <__ranges/enable_view.h> -# include <__ranges/filter_view.h> -# include <__ranges/iota_view.h> -# include <__ranges/join_view.h> -# include <__ranges/lazy_split_view.h> -# include <__ranges/rbegin.h> -# include <__ranges/ref_view.h> -# include <__ranges/rend.h> -# include <__ranges/reverse_view.h> -# include <__ranges/single_view.h> -# include <__ranges/size.h> -# include <__ranges/split_view.h> -# include <__ranges/subrange.h> -# include <__ranges/take_view.h> -# include <__ranges/take_while_view.h> -# include <__ranges/transform_view.h> -# include <__ranges/view_interface.h> -# include <__ranges/views.h> +# include <__cxx03/__ranges/access.h> +# include <__cxx03/__ranges/all.h> +# include <__cxx03/__ranges/common_view.h> +# include <__cxx03/__ranges/concepts.h> +# include <__cxx03/__ranges/counted.h> +# include <__cxx03/__ranges/dangling.h> +# include <__cxx03/__ranges/data.h> +# include <__cxx03/__ranges/drop_view.h> +# include <__cxx03/__ranges/drop_while_view.h> +# include <__cxx03/__ranges/elements_view.h> +# include <__cxx03/__ranges/empty.h> +# include <__cxx03/__ranges/empty_view.h> +# include <__cxx03/__ranges/enable_borrowed_range.h> +# include <__cxx03/__ranges/enable_view.h> +# include <__cxx03/__ranges/filter_view.h> +# include <__cxx03/__ranges/iota_view.h> +# include <__cxx03/__ranges/join_view.h> +# include <__cxx03/__ranges/lazy_split_view.h> +# include <__cxx03/__ranges/rbegin.h> +# include <__cxx03/__ranges/ref_view.h> +# include <__cxx03/__ranges/rend.h> +# include <__cxx03/__ranges/reverse_view.h> +# include <__cxx03/__ranges/single_view.h> +# include <__cxx03/__ranges/size.h> +# include <__cxx03/__ranges/split_view.h> +# include <__cxx03/__ranges/subrange.h> +# include <__cxx03/__ranges/take_view.h> +# include <__cxx03/__ranges/take_while_view.h> +# include <__cxx03/__ranges/transform_view.h> +# include 
<__cxx03/__ranges/view_interface.h> +# include <__cxx03/__ranges/views.h> # if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include <__ranges/istream_view.h> +# include <__cxx03/__ranges/istream_view.h> # endif #endif #if _LIBCPP_STD_VER >= 23 -# include <__ranges/as_rvalue_view.h> -# include <__ranges/chunk_by_view.h> -# include <__ranges/from_range.h> -# include <__ranges/repeat_view.h> -# include <__ranges/to.h> -# include <__ranges/zip_view.h> +# include <__cxx03/__ranges/as_rvalue_view.h> +# include <__cxx03/__ranges/chunk_by_view.h> +# include <__cxx03/__ranges/from_range.h> +# include <__cxx03/__ranges/repeat_view.h> +# include <__cxx03/__ranges/to.h> +# include <__cxx03/__ranges/zip_view.h> #endif -#include +#include <__cxx03/version> // standard-mandated includes // [ranges.syn] -#include -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> +#include <__cxx03/iterator> // [tuple.helper] -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_size.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include -# include -# include -# include -# include +# include <__cxx03/cstddef> +# include <__cxx03/limits> +# include <__cxx03/optional> +# include <__cxx03/span> +# include <__cxx03/tuple> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_RANGES diff --git a/libcxx/include/__cxx03/ratio b/libcxx/include/__cxx03/ratio index b989c272aaee6..e167b36b7dd17 100644 --- a/libcxx/include/__cxx03/ratio +++ b/libcxx/include/__cxx03/ratio @@ -81,18 +81,18 @@ using quetta = ratio <1'000'000'000'000'000'000'000'000'000'000, 1>; // Since C+ } */ -#include <__config> 
-#include <__type_traits/integral_constant.h> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/climits> +#include <__cxx03/cstdint> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -511,7 +511,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_RATIO diff --git a/libcxx/include/__cxx03/regex b/libcxx/include/__cxx03/regex index b814135121321..9d95db45d419d 100644 --- a/libcxx/include/__cxx03/regex +++ b/libcxx/include/__cxx03/regex @@ -13,8 +13,8 @@ /* regex synopsis -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> namespace std { @@ -789,46 +789,46 @@ typedef regex_token_iterator wsregex_token_iterator; } // std */ -#include <__algorithm/find.h> -#include <__algorithm/search.h> -#include <__assert> -#include <__config> -#include <__iterator/back_insert_iterator.h> -#include <__iterator/default_sentinel.h> -#include <__iterator/wrap_iter.h> -#include <__locale> -#include <__memory/shared_ptr.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__type_traits/is_swappable.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include <__verbose_abort> -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/find.h> +#include <__cxx03/__algorithm/search.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/default_sentinel.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__locale> +#include <__cxx03/__memory/shared_ptr.h> +#include 
<__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/deque> +#include <__cxx03/stdexcept> +#include <__cxx03/string> +#include <__cxx03/vector> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [re.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #define _LIBCPP_REGEX_COMPLEXITY_FACTOR 4096 @@ -5821,16 +5821,16 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/iterator> +# include <__cxx03/mutex> +# include <__cxx03/new> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_REGEX diff --git a/libcxx/include/__cxx03/scoped_allocator b/libcxx/include/__cxx03/scoped_allocator index a49ff465b1d55..6c62741ca2c8d 100644 --- a/libcxx/include/__cxx03/scoped_allocator +++ b/libcxx/include/__cxx03/scoped_allocator @@ -109,28 +109,28 @@ template */ -#include <__config> -#include 
<__memory/allocator_traits.h> -#include <__memory/uses_allocator_construction.h> -#include <__type_traits/common_type.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/remove_reference.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/piecewise_construct.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/uses_allocator_construction.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/tuple> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -557,17 +557,17 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/climits> +# include <__cxx03/concepts> +# include <__cxx03/cstring> +# include <__cxx03/ctime> +# include <__cxx03/iterator> +# include <__cxx03/memory> +# include <__cxx03/ratio> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> +# include <__cxx03/variant> #endif #endif // _LIBCPP_SCOPED_ALLOCATOR diff --git a/libcxx/include/__cxx03/semaphore 
b/libcxx/include/__cxx03/semaphore index 95a4375f21c17..e8f9f072bb662 100644 --- a/libcxx/include/__cxx03/semaphore +++ b/libcxx/include/__cxx03/semaphore @@ -45,28 +45,28 @@ using binary_semaphore = counting_semaphore<1>; */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__assert> -# include <__atomic/atomic_base.h> -# include <__atomic/atomic_sync.h> -# include <__atomic/memory_order.h> -# include <__chrono/time_point.h> -# include <__thread/poll_with_backoff.h> -# include <__thread/support.h> -# include <__thread/timed_backoff_policy.h> -# include -# include -# include +# include <__cxx03/__assert> +# include <__cxx03/__atomic/atomic_base.h> +# include <__cxx03/__atomic/atomic_sync.h> +# include <__cxx03/__atomic/memory_order.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__thread/poll_with_backoff.h> +# include <__cxx03/__thread/support.h> +# include <__cxx03/__thread/timed_backoff_policy.h> +# include <__cxx03/cstddef> +# include <__cxx03/limits> +# include <__cxx03/version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> # if _LIBCPP_STD_VER >= 14 @@ -182,7 +182,7 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/atomic> #endif #endif //_LIBCPP_SEMAPHORE diff --git a/libcxx/include/__cxx03/set b/libcxx/include/__cxx03/set index 7e9661a0149ab..b7312fff21395 100644 --- a/libcxx/include/__cxx03/set +++ b/libcxx/include/__cxx03/set @@ -512,47 +512,47 @@ erase_if(multiset& c, Predicate pred); // C++20 */ -#include <__algorithm/equal.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__assert> -#include <__config> -#include <__functional/is_transparent.h> -#include 
<__functional/operations.h> -#include <__iterator/erase_if_container.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/ranges_iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__memory/allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__node_handle> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__tree> -#include <__type_traits/is_allocator.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/is_transparent.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__iterator/erase_if_container.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/ranges_iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__node_handle> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__tree> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [associative.set.syn] -#include -#include +#include <__cxx03/compare> +#include 
<__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1489,12 +1489,12 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/functional> +# include <__cxx03/iterator> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_SET diff --git a/libcxx/include/__cxx03/shared_mutex b/libcxx/include/__cxx03/shared_mutex index f63bd25493878..99f77a8921005 100644 --- a/libcxx/include/__cxx03/shared_mutex +++ b/libcxx/include/__cxx03/shared_mutex @@ -122,25 +122,25 @@ template */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__chrono/duration.h> -# include <__chrono/steady_clock.h> -# include <__chrono/time_point.h> -# include <__condition_variable/condition_variable.h> -# include <__memory/addressof.h> -# include <__mutex/mutex.h> -# include <__mutex/tag_types.h> -# include <__mutex/unique_lock.h> -# include <__system_error/system_error.h> -# include <__utility/swap.h> -# include -# include +# include <__cxx03/__chrono/duration.h> +# include <__cxx03/__chrono/steady_clock.h> +# include <__cxx03/__chrono/time_point.h> +# include <__cxx03/__condition_variable/condition_variable.h> +# include <__cxx03/__memory/addressof.h> +# include <__cxx03/__mutex/mutex.h> +# include <__cxx03/__mutex/tag_types.h> +# include <__cxx03/__mutex/unique_lock.h> +# include <__cxx03/__system_error/system_error.h> +# include <__cxx03/__utility/swap.h> +# include <__cxx03/cerrno> +# include <__cxx03/version> _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> # if _LIBCPP_STD_VER >= 14 @@ -458,7 +458,7 @@ 
_LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/system_error> #endif #endif // _LIBCPP_SHARED_MUTEX diff --git a/libcxx/include/__cxx03/source_location b/libcxx/include/__cxx03/source_location index d16e3c46fce55..509a547808821 100644 --- a/libcxx/include/__cxx03/source_location +++ b/libcxx/include/__cxx03/source_location @@ -25,9 +25,9 @@ namespace std { } */ -#include <__config> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/cstdint> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/span b/libcxx/include/__cxx03/span index da631cdc3f90e..d09f96468064b 100644 --- a/libcxx/include/__cxx03/span +++ b/libcxx/include/__cxx03/span @@ -144,55 +144,55 @@ template */ -#include <__assert> -#include <__concepts/convertible_to.h> -#include <__concepts/equality_comparable.h> -#include <__config> -#include <__fwd/array.h> -#include <__fwd/span.h> -#include <__iterator/bounded_iter.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/wrap_iter.h> -#include <__memory/pointer_traits.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__ranges/size.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_same.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include // for byte -#include -#include 
-#include +#include <__cxx03/__assert> +#include <__cxx03/__concepts/convertible_to.h> +#include <__cxx03/__concepts/equality_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/span.h> +#include <__cxx03/__iterator/bounded_iter.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/enable_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/cstddef> // for byte +#include <__cxx03/initializer_list> +#include <__cxx03/stdexcept> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include 
<__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -626,11 +626,11 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/array> +# include <__cxx03/concepts> +# include <__cxx03/functional> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_SPAN diff --git a/libcxx/include/__cxx03/sstream b/libcxx/include/__cxx03/sstream index 9ba43ffeb850f..2667ae9cb18e1 100644 --- a/libcxx/include/__cxx03/sstream +++ b/libcxx/include/__cxx03/sstream @@ -312,22 +312,22 @@ typedef basic_stringstream wstringstream; // clang-format on -#include <__config> -#include <__fwd/sstream.h> -#include <__ostream/basic_ostream.h> -#include <__type_traits/is_convertible.h> -#include <__utility/swap.h> -#include -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__fwd/sstream.h> +#include <__cxx03/__ostream/basic_ostream.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/istream> +#include <__cxx03/string> +#include <__cxx03/string_view> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1269,8 +1269,8 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) -# include -# include +# include <__cxx03/ostream> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_SSTREAM diff --git a/libcxx/include/__cxx03/stack b/libcxx/include/__cxx03/stack index 90f8933cca308..a868878653598 100644 --- a/libcxx/include/__cxx03/stack +++ b/libcxx/include/__cxx03/stack @@ -113,33 +113,33 @@ template */ -#include <__algorithm/ranges_copy.h> -#include <__config> -#include <__fwd/stack.h> -#include 
<__iterator/back_insert_iterator.h> -#include <__iterator/iterator_traits.h> -#include <__memory/uses_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__type_traits/is_same.h> -#include <__utility/forward.h> -#include -#include +#include <__cxx03/__algorithm/ranges_copy.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/stack.h> +#include <__cxx03/__iterator/back_insert_iterator.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__memory/uses_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/deque> +#include <__cxx03/version> // standard-mandated includes // [stack.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -371,9 +371,9 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/functional> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_STACK diff --git a/libcxx/include/__cxx03/stdatomic.h b/libcxx/include/__cxx03/stdatomic.h index 79772eb7fce1f..15f68b93a414f 100644 --- a/libcxx/include/__cxx03/stdatomic.h +++ b/libcxx/include/__cxx03/stdatomic.h @@ -115,7 +115,7 @@ using std::atomic_signal_fence // see below */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -123,8 +123,8 @@ using std::atomic_signal_fence 
// see below #if defined(__cplusplus) && _LIBCPP_STD_VER >= 23 -# include -# include +# include <__cxx03/atomic> +# include <__cxx03/version> # ifdef _Atomic # undef _Atomic diff --git a/libcxx/include/__cxx03/stdbool.h b/libcxx/include/__cxx03/stdbool.h index e74d91f459459..0444340e24412 100644 --- a/libcxx/include/__cxx03/stdbool.h +++ b/libcxx/include/__cxx03/stdbool.h @@ -19,7 +19,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/stddef.h b/libcxx/include/__cxx03/stddef.h index 1583e78e3739b..38c15a3f2f247 100644 --- a/libcxx/include/__cxx03/stddef.h +++ b/libcxx/include/__cxx03/stddef.h @@ -24,7 +24,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/stdexcept b/libcxx/include/__cxx03/stdexcept index 853c185187c77..cd035f6df86db 100644 --- a/libcxx/include/__cxx03/stdexcept +++ b/libcxx/include/__cxx03/stdexcept @@ -41,10 +41,10 @@ public: */ -#include <__config> -#include <__exception/exception.h> -#include <__fwd/string.h> -#include <__verbose_abort> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__fwd/string.h> +#include <__cxx03/__verbose_abort> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -278,9 +278,9 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/exception> +# include <__cxx03/iosfwd> #endif #endif // _LIBCPP_STDEXCEPT diff --git a/libcxx/include/__cxx03/stdint.h b/libcxx/include/__cxx03/stdint.h index 35e5b8cbdad26..5e845a7a517a6 100644 --- a/libcxx/include/__cxx03/stdint.h +++ b/libcxx/include/__cxx03/stdint.h @@ -103,7 
+103,7 @@ */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/stdio.h b/libcxx/include/__cxx03/stdio.h index 3aa559393f185..214d9481bf4c6 100644 --- a/libcxx/include/__cxx03/stdio.h +++ b/libcxx/include/__cxx03/stdio.h @@ -98,7 +98,7 @@ int ferror(FILE* stream); void perror(const char* s); */ -# include <__config> +# include <__cxx03/__config> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/stdlib.h b/libcxx/include/__cxx03/stdlib.h index a74344d49150c..77c85705e1911 100644 --- a/libcxx/include/__cxx03/stdlib.h +++ b/libcxx/include/__cxx03/stdlib.h @@ -84,7 +84,7 @@ void *aligned_alloc(size_t alignment, size_t size); // C11 */ -# include <__config> +# include <__cxx03/__config> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/stop_token b/libcxx/include/__cxx03/stop_token index d4e651d9541f4..7173162cf66af 100644 --- a/libcxx/include/__cxx03/stop_token +++ b/libcxx/include/__cxx03/stop_token @@ -31,17 +31,17 @@ namespace std { */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) # if _LIBCPP_STD_VER >= 20 -# include <__stop_token/stop_callback.h> -# include <__stop_token/stop_source.h> -# include <__stop_token/stop_token.h> +# include <__cxx03/__stop_token/stop_callback.h> +# include <__cxx03/__stop_token/stop_source.h> +# include <__cxx03/__stop_token/stop_token.h> # endif -# include +# include <__cxx03/version> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -50,7 +50,7 @@ namespace std { #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/iosfwd> #endif #endif // _LIBCPP_STOP_TOKEN diff --git a/libcxx/include/__cxx03/streambuf 
b/libcxx/include/__cxx03/streambuf index 5a3c17ef7c99e..9f60d38b4731f 100644 --- a/libcxx/include/__cxx03/streambuf +++ b/libcxx/include/__cxx03/streambuf @@ -107,23 +107,23 @@ protected: */ -#include <__assert> -#include <__config> -#include <__fwd/streambuf.h> -#include <__locale> -#include <__type_traits/is_same.h> -#include <__utility/is_valid_range.h> -#include -#include -#include -#include +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__fwd/streambuf.h> +#include <__cxx03/__locale> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__utility/is_valid_range.h> +#include <__cxx03/climits> +#include <__cxx03/ios> +#include <__cxx03/iosfwd> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -439,7 +439,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/cstdint> #endif #endif // _LIBCPP_STREAMBUF diff --git a/libcxx/include/__cxx03/string b/libcxx/include/__cxx03/string index 08d66803f8a1a..2f2f623440870 100644 --- a/libcxx/include/__cxx03/string +++ b/libcxx/include/__cxx03/string @@ -15,8 +15,8 @@ /* string synopsis -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> namespace std { @@ -586,90 +586,90 @@ basic_string operator""s( const char32_t *str, size_t len ); // clang-format on -#include <__algorithm/max.h> -#include <__algorithm/min.h> -#include <__algorithm/remove.h> -#include <__algorithm/remove_if.h> -#include <__assert> -#include <__config> -#include <__debug_utils/sanitizers.h> -#include <__format/enable_insertable.h> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__fwd/string.h> -#include <__ios/fpos.h> -#include <__iterator/bounded_iter.h> -#include 
<__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/wrap_iter.h> -#include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocator.h> -#include <__memory/allocator_traits.h> -#include <__memory/compressed_pair.h> -#include <__memory/construct_at.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__ranges/size.h> -#include <__string/char_traits.h> -#include <__string/extern_template_lists.h> -#include <__type_traits/conditional.h> -#include <__type_traits/is_allocator.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_standard_layout.h> -#include <__type_traits/is_trivial.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/noexcept_move_assign_container.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/void_t.h> -#include <__utility/auto_cast.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/is_pointer_in_range.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__utility/unreachable.h> -#include -#include // EOF -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/max.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/remove.h> +#include <__cxx03/__algorithm/remove_if.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/sanitizers.h> +#include <__cxx03/__format/enable_insertable.h> +#include <__cxx03/__functional/hash.h> +#include 
<__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/string.h> +#include <__cxx03/__ios/fpos.h> +#include <__cxx03/__iterator/bounded_iter.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocate_at_least.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/compressed_pair.h> +#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__string/char_traits.h> +#include <__cxx03/__string/extern_template_lists.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_standard_layout.h> +#include <__cxx03/__type_traits/is_trivial.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/noexcept_move_assign_container.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/auto_cast.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/is_pointer_in_range.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> 
+#include <__cxx03/__utility/unreachable.h> +#include <__cxx03/climits> +#include <__cxx03/cstdio> // EOF +#include <__cxx03/cstring> +#include <__cxx03/limits> +#include <__cxx03/stdexcept> +#include <__cxx03/string_view> +#include <__cxx03/version> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -# include +# include <__cxx03/cwchar> #endif // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [string.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) # define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS __attribute__((__no_sanitize__("address"))) @@ -4339,14 +4339,14 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iterator> +# include <__cxx03/new> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_STRING diff --git a/libcxx/include/__cxx03/string.h b/libcxx/include/__cxx03/string.h index ae10d5ad2c76f..e1dc4cbd8f29e 100644 --- a/libcxx/include/__cxx03/string.h +++ b/libcxx/include/__cxx03/string.h @@ -51,7 +51,7 @@ size_t strlen(const char* s); */ -#include <__config> +#include <__cxx03/__config> #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/string_view b/libcxx/include/__cxx03/string_view index 72dbf0bfa8e54..8b21ecb3d9a8b 100644 --- a/libcxx/include/__cxx03/string_view +++ b/libcxx/include/__cxx03/string_view @@ -16,7 +16,7 @@ string_view synopsis -#include +#include <__cxx03/compare> namespace std { @@ -205,57 +205,57 @@ namespace std { // clang-format on -#include <__algorithm/min.h> -#include <__assert> -#include <__config> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__fwd/ostream.h> -#include <__fwd/string_view.h> -#include <__iterator/bounded_iter.h> -#include <__iterator/concepts.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/wrap_iter.h> -#include <__memory/pointer_traits.h> -#include <__ranges/concepts.h> -#include <__ranges/data.h> -#include <__ranges/enable_borrowed_range.h> -#include <__ranges/enable_view.h> -#include <__ranges/size.h> -#include <__string/char_traits.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_standard_layout.h> -#include <__type_traits/is_trivial.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__type_traits/type_identity.h> -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/ostream.h> +#include <__cxx03/__fwd/string_view.h> +#include <__cxx03/__iterator/bounded_iter.h> +#include <__cxx03/__iterator/concepts.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__memory/pointer_traits.h> +#include 
<__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/data.h> +#include <__cxx03/__ranges/enable_borrowed_range.h> +#include <__cxx03/__ranges/enable_view.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__string/char_traits.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_standard_layout.h> +#include <__cxx03/__type_traits/is_trivial.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/cstddef> +#include <__cxx03/iosfwd> +#include <__cxx03/limits> +#include <__cxx03/stdexcept> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [string.view.synop] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -938,11 +938,11 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_STRING_VIEW diff --git a/libcxx/include/__cxx03/strstream b/libcxx/include/__cxx03/strstream index 9ff4024a7c7e2..80e166acb7515 100644 --- a/libcxx/include/__cxx03/strstream +++ 
b/libcxx/include/__cxx03/strstream @@ -129,10 +129,10 @@ private: */ -#include <__config> -#include -#include -#include +#include <__cxx03/__config> +#include <__cxx03/istream> +#include <__cxx03/ostream> +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -141,7 +141,7 @@ private: #if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/syncstream b/libcxx/include/__cxx03/syncstream index e6f35b6f428ed..4e67b43f3ccb9 100644 --- a/libcxx/include/__cxx03/syncstream +++ b/libcxx/include/__cxx03/syncstream @@ -13,7 +13,7 @@ /* syncstream synopsis -#include // see [ostream.syn] +#include <__cxx03/ostream> // see [ostream.syn] namespace std { template @@ -115,30 +115,30 @@ namespace std { */ -#include <__config> -#include <__utility/move.h> -#include -#include // required for declaration of default arguments -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__utility/move.h> +#include <__cxx03/ios> +#include <__cxx03/iosfwd> // required for declaration of default arguments +#include <__cxx03/streambuf> +#include <__cxx03/string> #ifndef _LIBCPP_HAS_NO_THREADS -# include -# include -# include +# include <__cxx03/map> +# include <__cxx03/mutex> +# include <__cxx03/shared_mutex> #endif // standard-mandated includes // [syncstream.syn] -#include +#include <__cxx03/ostream> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__cxx03/system_error b/libcxx/include/__cxx03/system_error index eeab347788a9a..eb5a2dc4532e5 100644 --- a/libcxx/include/__cxx03/system_error +++ b/libcxx/include/__cxx03/system_error @@ -144,28 +144,28 @@ 
template <> struct hash; */ -#include <__config> -#include <__system_error/errc.h> -#include <__system_error/error_category.h> -#include <__system_error/error_code.h> -#include <__system_error/error_condition.h> -#include <__system_error/system_error.h> -#include +#include <__cxx03/__config> +#include <__cxx03/__system_error/errc.h> +#include <__cxx03/__system_error/error_category.h> +#include <__cxx03/__system_error/error_code.h> +#include <__cxx03/__system_error/error_condition.h> +#include <__cxx03/__system_error/system_error.h> +#include <__cxx03/version> // standard-mandated includes // [system.error.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include +# include <__cxx03/cstdint> +# include <__cxx03/cstring> +# include <__cxx03/limits> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_SYSTEM_ERROR diff --git a/libcxx/include/__cxx03/tgmath.h b/libcxx/include/__cxx03/tgmath.h index e6f0a4ab2611f..52637d1eaa048 100644 --- a/libcxx/include/__cxx03/tgmath.h +++ b/libcxx/include/__cxx03/tgmath.h @@ -13,18 +13,18 @@ /* tgmath.h synopsis -#include +#include <__cxx03/ctgmath> */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #ifdef __cplusplus -# include +# include <__cxx03/ctgmath> #else # if __has_include_next() # include_next diff --git a/libcxx/include/__cxx03/thread b/libcxx/include/__cxx03/thread index 25cb7ce6d7231..f6d9cc8096a74 100644 --- a/libcxx/include/__cxx03/thread +++ b/libcxx/include/__cxx03/thread @@ -86,21 +86,21 @@ void sleep_for(const chrono::duration& rel_time); */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_THREADS) -# include <__thread/formatter.h> -# include <__thread/jthread.h> -# include <__thread/support.h> -# include 
<__thread/this_thread.h> -# include <__thread/thread.h> -# include +# include <__cxx03/__thread/formatter.h> +# include <__cxx03/__thread/jthread.h> +# include <__cxx03/__thread/support.h> +# include <__cxx03/__thread/this_thread.h> +# include <__cxx03/__thread/thread.h> +# include <__cxx03/version> // standard-mandated includes // [thread.syn] -# include +# include <__cxx03/compare> # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -109,22 +109,22 @@ void sleep_for(const chrono::duration& rel_time); #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) -# include -# include -# include -# include +# include <__cxx03/cstddef> +# include <__cxx03/ctime> +# include <__cxx03/iosfwd> +# include <__cxx03/ratio> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 -# include +# include <__cxx03/chrono> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/cstring> +# include <__cxx03/functional> +# include <__cxx03/new> +# include <__cxx03/system_error> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_THREAD diff --git a/libcxx/include/__cxx03/tuple b/libcxx/include/__cxx03/tuple index 5161c2aa97c2b..72375ad6c0abc 100644 --- a/libcxx/include/__cxx03/tuple +++ b/libcxx/include/__cxx03/tuple @@ -210,69 +210,69 @@ template // clang-format on -#include <__compare/common_comparison_category.h> -#include <__compare/synth_three_way.h> -#include <__config> -#include <__functional/invoke.h> -#include <__fwd/array.h> -#include <__fwd/pair.h> -#include <__fwd/tuple.h> -#include <__memory/allocator_arg_t.h> -#include <__memory/uses_allocator.h> -#include <__tuple/find_index.h> -#include <__tuple/ignore.h> -#include <__tuple/make_tuple_types.h> -#include <__tuple/sfinae_helpers.h> -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_indices.h> -#include 
<__tuple/tuple_like_ext.h> -#include <__tuple/tuple_size.h> -#include <__tuple/tuple_types.h> -#include <__type_traits/common_reference.h> -#include <__type_traits/common_type.h> -#include <__type_traits/conditional.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/copy_cvref.h> -#include <__type_traits/disjunction.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_empty.h> -#include <__type_traits/is_final.h> -#include <__type_traits/is_implicitly_default_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_swappable.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/lazy.h> -#include <__type_traits/maybe_const.h> -#include <__type_traits/nat.h> -#include <__type_traits/negation.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/remove_reference.h> -#include <__type_traits/unwrap_ref.h> -#include <__utility/forward.h> -#include <__utility/integer_sequence.h> -#include <__utility/move.h> -#include <__utility/piecewise_construct.h> -#include <__utility/swap.h> -#include -#include +#include <__cxx03/__compare/common_comparison_category.h> +#include <__cxx03/__compare/synth_three_way.h> +#include <__cxx03/__config> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__fwd/array.h> +#include <__cxx03/__fwd/pair.h> +#include <__cxx03/__fwd/tuple.h> +#include <__cxx03/__memory/allocator_arg_t.h> +#include <__cxx03/__memory/uses_allocator.h> +#include <__cxx03/__tuple/find_index.h> +#include <__cxx03/__tuple/ignore.h> +#include <__cxx03/__tuple/make_tuple_types.h> +#include <__cxx03/__tuple/sfinae_helpers.h> +#include <__cxx03/__tuple/tuple_element.h> +#include 
<__cxx03/__tuple/tuple_indices.h> +#include <__cxx03/__tuple/tuple_like_ext.h> +#include <__cxx03/__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_types.h> +#include <__cxx03/__type_traits/common_reference.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/copy_cvref.h> +#include <__cxx03/__type_traits/disjunction.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_empty.h> +#include <__cxx03/__type_traits/is_final.h> +#include <__cxx03/__type_traits/is_implicitly_default_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include <__cxx03/__type_traits/is_swappable.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/lazy.h> +#include <__cxx03/__type_traits/maybe_const.h> +#include <__cxx03/__type_traits/nat.h> +#include <__cxx03/__type_traits/negation.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__type_traits/unwrap_ref.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cstddef> +#include <__cxx03/version> // standard-mandated includes // [tuple.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
_LIBCPP_BEGIN_NAMESPACE_STD @@ -1408,12 +1408,12 @@ _LIBCPP_POP_MACROS // clang-format on #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/exception> +# include <__cxx03/iosfwd> +# include <__cxx03/new> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_TUPLE diff --git a/libcxx/include/__cxx03/type_traits b/libcxx/include/__cxx03/type_traits index 7f231cd09df51..484441cbb6700 100644 --- a/libcxx/include/__cxx03/type_traits +++ b/libcxx/include/__cxx03/type_traits @@ -417,104 +417,104 @@ namespace std */ -#include <__config> -#include <__fwd/functional.h> // This is https://llvm.org/PR56938 -#include <__type_traits/add_const.h> -#include <__type_traits/add_cv.h> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/add_volatile.h> -#include <__type_traits/aligned_storage.h> -#include <__type_traits/aligned_union.h> -#include <__type_traits/alignment_of.h> -#include <__type_traits/common_type.h> -#include <__type_traits/conditional.h> -#include <__type_traits/decay.h> -#include <__type_traits/enable_if.h> -#include <__type_traits/extent.h> -#include <__type_traits/has_virtual_destructor.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_abstract.h> -#include <__type_traits/is_arithmetic.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_assignable.h> -#include <__type_traits/is_base_of.h> -#include <__type_traits/is_class.h> -#include <__type_traits/is_compound.h> -#include <__type_traits/is_const.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_convertible.h> -#include <__type_traits/is_destructible.h> -#include <__type_traits/is_empty.h> -#include <__type_traits/is_enum.h> -#include 
<__type_traits/is_floating_point.h> -#include <__type_traits/is_function.h> -#include <__type_traits/is_fundamental.h> -#include <__type_traits/is_integral.h> -#include <__type_traits/is_literal_type.h> -#include <__type_traits/is_member_pointer.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_nothrow_destructible.h> -#include <__type_traits/is_object.h> -#include <__type_traits/is_pod.h> -#include <__type_traits/is_pointer.h> -#include <__type_traits/is_polymorphic.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_scalar.h> -#include <__type_traits/is_signed.h> -#include <__type_traits/is_standard_layout.h> -#include <__type_traits/is_trivial.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_copyable.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_union.h> -#include <__type_traits/is_unsigned.h> -#include <__type_traits/is_void.h> -#include <__type_traits/is_volatile.h> -#include <__type_traits/make_signed.h> -#include <__type_traits/make_unsigned.h> -#include <__type_traits/rank.h> -#include <__type_traits/remove_all_extents.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cv.h> -#include <__type_traits/remove_extent.h> -#include <__type_traits/remove_pointer.h> -#include <__type_traits/remove_reference.h> -#include <__type_traits/remove_volatile.h> -#include <__type_traits/result_of.h> -#include <__type_traits/underlying_type.h> +#include <__cxx03/__config> +#include <__cxx03/__fwd/functional.h> // This is https://llvm.org/PR56938 +#include <__cxx03/__type_traits/add_const.h> +#include <__cxx03/__type_traits/add_cv.h> +#include <__cxx03/__type_traits/add_lvalue_reference.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include 
<__cxx03/__type_traits/add_rvalue_reference.h> +#include <__cxx03/__type_traits/add_volatile.h> +#include <__cxx03/__type_traits/aligned_storage.h> +#include <__cxx03/__type_traits/aligned_union.h> +#include <__cxx03/__type_traits/alignment_of.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/conditional.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/enable_if.h> +#include <__cxx03/__type_traits/extent.h> +#include <__cxx03/__type_traits/has_virtual_destructor.h> +#include <__cxx03/__type_traits/integral_constant.h> +#include <__cxx03/__type_traits/is_abstract.h> +#include <__cxx03/__type_traits/is_arithmetic.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_assignable.h> +#include <__cxx03/__type_traits/is_base_of.h> +#include <__cxx03/__type_traits/is_class.h> +#include <__cxx03/__type_traits/is_compound.h> +#include <__cxx03/__type_traits/is_const.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_convertible.h> +#include <__cxx03/__type_traits/is_destructible.h> +#include <__cxx03/__type_traits/is_empty.h> +#include <__cxx03/__type_traits/is_enum.h> +#include <__cxx03/__type_traits/is_floating_point.h> +#include <__cxx03/__type_traits/is_function.h> +#include <__cxx03/__type_traits/is_fundamental.h> +#include <__cxx03/__type_traits/is_integral.h> +#include <__cxx03/__type_traits/is_literal_type.h> +#include <__cxx03/__type_traits/is_member_pointer.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_destructible.h> +#include <__cxx03/__type_traits/is_object.h> +#include <__cxx03/__type_traits/is_pod.h> +#include <__cxx03/__type_traits/is_pointer.h> +#include <__cxx03/__type_traits/is_polymorphic.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_same.h> +#include 
<__cxx03/__type_traits/is_scalar.h> +#include <__cxx03/__type_traits/is_signed.h> +#include <__cxx03/__type_traits/is_standard_layout.h> +#include <__cxx03/__type_traits/is_trivial.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_copyable.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_union.h> +#include <__cxx03/__type_traits/is_unsigned.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/is_volatile.h> +#include <__cxx03/__type_traits/make_signed.h> +#include <__cxx03/__type_traits/make_unsigned.h> +#include <__cxx03/__type_traits/rank.h> +#include <__cxx03/__type_traits/remove_all_extents.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cv.h> +#include <__cxx03/__type_traits/remove_extent.h> +#include <__cxx03/__type_traits/remove_pointer.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__type_traits/remove_volatile.h> +#include <__cxx03/__type_traits/result_of.h> +#include <__cxx03/__type_traits/underlying_type.h> #if _LIBCPP_STD_VER >= 14 -# include <__type_traits/is_final.h> -# include <__type_traits/is_null_pointer.h> +# include <__cxx03/__type_traits/is_final.h> +# include <__cxx03/__type_traits/is_null_pointer.h> #endif #if _LIBCPP_STD_VER >= 17 -# include <__type_traits/conjunction.h> -# include <__type_traits/disjunction.h> -# include <__type_traits/has_unique_object_representation.h> -# include <__type_traits/invoke.h> -# include <__type_traits/is_aggregate.h> -# include <__type_traits/is_swappable.h> -# include <__type_traits/negation.h> -# include <__type_traits/void_t.h> +# include <__cxx03/__type_traits/conjunction.h> +# include <__cxx03/__type_traits/disjunction.h> +# include <__cxx03/__type_traits/has_unique_object_representation.h> +# include <__cxx03/__type_traits/invoke.h> +# 
include <__cxx03/__type_traits/is_aggregate.h> +# include <__cxx03/__type_traits/is_swappable.h> +# include <__cxx03/__type_traits/negation.h> +# include <__cxx03/__type_traits/void_t.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__type_traits/common_reference.h> -# include <__type_traits/is_bounded_array.h> -# include <__type_traits/is_constant_evaluated.h> -# include <__type_traits/is_nothrow_convertible.h> -# include <__type_traits/is_unbounded_array.h> -# include <__type_traits/type_identity.h> -# include <__type_traits/unwrap_ref.h> +# include <__cxx03/__type_traits/common_reference.h> +# include <__cxx03/__type_traits/is_bounded_array.h> +# include <__cxx03/__type_traits/is_constant_evaluated.h> +# include <__cxx03/__type_traits/is_nothrow_convertible.h> +# include <__cxx03/__type_traits/is_unbounded_array.h> +# include <__cxx03/__type_traits/type_identity.h> +# include <__cxx03/__type_traits/unwrap_ref.h> #endif -#include +#include <__cxx03/version> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/typeindex b/libcxx/include/__cxx03/typeindex index 6398aa40d616a..e1d6097ebfa61 100644 --- a/libcxx/include/__cxx03/typeindex +++ b/libcxx/include/__cxx03/typeindex @@ -45,13 +45,13 @@ struct hash */ -#include <__config> -#include <__functional/unary_function.h> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/typeinfo> +#include <__cxx03/version> // standard-mandated includes -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -98,9 +98,9 @@ struct _LIBCPP_TEMPLATE_VIS hash : public __unary_function -# include -# include +# include <__cxx03/iosfwd> +# include <__cxx03/new> +# include <__cxx03/utility> #endif #endif // _LIBCPP_TYPEINDEX diff --git a/libcxx/include/__cxx03/typeinfo b/libcxx/include/__cxx03/typeinfo index 2727cad02fa99..823c3cbae1cfb 100644 --- 
a/libcxx/include/__cxx03/typeinfo +++ b/libcxx/include/__cxx03/typeinfo @@ -56,19 +56,19 @@ public: */ -#include <__config> -#include <__exception/exception.h> -#include <__type_traits/is_constant_evaluated.h> -#include <__verbose_abort> -#include -#include +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__type_traits/is_constant_evaluated.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/cstddef> +#include <__cxx03/cstdint> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if defined(_LIBCPP_ABI_VCRUNTIME) -# include +# include <__cxx03/vcruntime_typeinfo.h> #else namespace std // purposefully not using versioning namespace @@ -382,8 +382,8 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() { _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_TYPEINFO diff --git a/libcxx/include/__cxx03/uchar.h b/libcxx/include/__cxx03/uchar.h index 07b78611406d5..29a10211ad36e 100644 --- a/libcxx/include/__cxx03/uchar.h +++ b/libcxx/include/__cxx03/uchar.h @@ -32,7 +32,7 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps); */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -42,13 +42,13 @@ size_t c32rtomb(char* s, char32_t c32, mbstate_t* ps); // Some platforms don't implement and we don't want to give a hard // error on those platforms. When the platform doesn't provide , at -// least include so we get the declaration for size_t, and try to +// least include <__cxx03/stddef.h> so we get the declaration for size_t, and try to // get the declaration of mbstate_t too. 
# if __has_include_next() # include_next # else -# include <__mbstate_t.h> -# include +# include <__cxx03/__mbstate_t.h> +# include <__cxx03/stddef.h> # endif #endif // _LIBCPP_CXX03_LANG diff --git a/libcxx/include/__cxx03/unordered_map b/libcxx/include/__cxx03/unordered_map index 7c31c4fce26b0..cccfc4900895a 100644 --- a/libcxx/include/__cxx03/unordered_map +++ b/libcxx/include/__cxx03/unordered_map @@ -14,7 +14,7 @@ unordered_map synopsis -#include +#include <__cxx03/initializer_list> namespace std { @@ -583,49 +583,49 @@ template */ -#include <__algorithm/is_permutation.h> -#include <__assert> -#include <__config> -#include <__functional/is_transparent.h> -#include <__functional/operations.h> -#include <__hash_table> -#include <__iterator/distance.h> -#include <__iterator/erase_if_container.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/ranges_iterator_traits.h> -#include <__memory/addressof.h> -#include <__memory/allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__node_handle> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__type_traits/is_allocator.h> -#include <__type_traits/type_identity.h> -#include <__utility/forward.h> -#include -#include -#include +#include <__cxx03/__algorithm/is_permutation.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/is_transparent.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__hash_table> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/erase_if_container.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/ranges_iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__node_handle> +#include <__cxx03/__ranges/concepts.h> +#include 
<__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/stdexcept> +#include <__cxx03/tuple> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [unord.map.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -2530,12 +2530,12 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/bit> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iterator> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_UNORDERED_MAP diff --git a/libcxx/include/__cxx03/unordered_set b/libcxx/include/__cxx03/unordered_set index 3297294a893f8..b64d334c129de 100644 --- a/libcxx/include/__cxx03/unordered_set +++ b/libcxx/include/__cxx03/unordered_set @@ -16,7 +16,7 @@ unordered_set synopsis -#include +#include <__cxx03/initializer_list> namespace std { @@ -531,46 +531,46 @@ template // clang-format on -#include <__algorithm/is_permutation.h> -#include <__assert> -#include <__config> -#include <__functional/is_transparent.h> -#include <__functional/operations.h> -#include 
<__hash_table> -#include <__iterator/distance.h> -#include <__iterator/erase_if_container.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/ranges_iterator_traits.h> -#include <__memory/addressof.h> -#include <__memory/allocator.h> -#include <__memory_resource/polymorphic_allocator.h> -#include <__node_handle> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__type_traits/is_allocator.h> -#include <__utility/forward.h> -#include +#include <__cxx03/__algorithm/is_permutation.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/is_transparent.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__hash_table> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/erase_if_container.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/ranges_iterator_traits.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__node_handle> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [unord.set.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header 
#endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -1800,12 +1800,12 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/functional> +# include <__cxx03/iterator> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_UNORDERED_SET diff --git a/libcxx/include/__cxx03/utility b/libcxx/include/__cxx03/utility index f97907fbf72e9..c0734df8db636 100644 --- a/libcxx/include/__cxx03/utility +++ b/libcxx/include/__cxx03/utility @@ -13,7 +13,7 @@ /* utility synopsis -#include +#include <__cxx03/initializer_list> namespace std { @@ -246,64 +246,64 @@ template */ -#include <__config> +#include <__cxx03/__config> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/piecewise_construct.h> -#include <__utility/rel_ops.h> -#include <__utility/swap.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/piecewise_construct.h> +#include <__cxx03/__utility/rel_ops.h> +#include <__cxx03/__utility/swap.h> #if _LIBCPP_STD_VER >= 14 -# include <__utility/exchange.h> -# include <__utility/integer_sequence.h> +# include <__cxx03/__utility/exchange.h> +# include <__cxx03/__utility/integer_sequence.h> #endif #if _LIBCPP_STD_VER >= 17 -# include <__utility/as_const.h> -# include <__utility/in_place.h> +# include <__cxx03/__utility/as_const.h> +# include <__cxx03/__utility/in_place.h> #endif #if _LIBCPP_STD_VER >= 20 -# include <__utility/cmp.h> +# include <__cxx03/__utility/cmp.h> #endif #if _LIBCPP_STD_VER >= 23 -# include <__utility/forward_like.h> -# include 
<__utility/to_underlying.h> -# include <__utility/unreachable.h> +# include <__cxx03/__utility/forward_like.h> +# include <__cxx03/__utility/to_underlying.h> +# include <__cxx03/__utility/unreachable.h> #endif -#include +#include <__cxx03/version> // standard-mandated includes // [utility.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> // [tuple.creation] -#include <__tuple/ignore.h> +#include <__cxx03/__tuple/ignore.h> // [tuple.helper] -#include <__tuple/tuple_element.h> -#include <__tuple/tuple_size.h> +#include <__cxx03/__tuple/tuple_element.h> +#include <__cxx03/__tuple/tuple_size.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include +# include <__cxx03/limits> #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_UTILITY diff --git a/libcxx/include/__cxx03/valarray b/libcxx/include/__cxx03/valarray index 44341eb2ba6c1..890e0a2723c7a 100644 --- a/libcxx/include/__cxx03/valarray +++ b/libcxx/include/__cxx03/valarray @@ -343,39 +343,39 @@ template unspecified2 end(const valarray& v); */ -#include <__algorithm/copy.h> -#include <__algorithm/count.h> -#include <__algorithm/fill.h> -#include <__algorithm/max_element.h> -#include <__algorithm/min.h> -#include <__algorithm/min_element.h> -#include <__algorithm/unwrap_iter.h> -#include <__assert> -#include <__config> -#include <__functional/operations.h> -#include <__memory/addressof.h> -#include <__memory/allocator.h> -#include <__memory/uninitialized_algorithms.h> -#include <__type_traits/decay.h> -#include <__type_traits/remove_reference.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include -#include -#include -#include +#include <__cxx03/__algorithm/copy.h> +#include 
<__cxx03/__algorithm/count.h> +#include <__cxx03/__algorithm/fill.h> +#include <__cxx03/__algorithm/max_element.h> +#include <__cxx03/__algorithm/min.h> +#include <__cxx03/__algorithm/min_element.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__assert> +#include <__cxx03/__config> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__memory/addressof.h> +#include <__cxx03/__memory/allocator.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__type_traits/decay.h> +#include <__cxx03/__type_traits/remove_reference.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/cmath> +#include <__cxx03/cstddef> +#include <__cxx03/new> +#include <__cxx03/version> // standard-mandated includes // [valarray.syn] -#include +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -3352,13 +3352,13 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/cstring> +# include <__cxx03/functional> +# include <__cxx03/stdexcept> +# include <__cxx03/type_traits> #endif #endif // _LIBCPP_VALARRAY diff --git a/libcxx/include/__cxx03/variant b/libcxx/include/__cxx03/variant index 5f2d03b7227b8..97e012bf81c95 100644 --- a/libcxx/include/__cxx03/variant +++ b/libcxx/include/__cxx03/variant @@ -212,66 +212,66 @@ namespace std { */ -#include <__compare/common_comparison_category.h> -#include <__compare/compare_three_way_result.h> -#include <__compare/three_way_comparable.h> -#include <__config> -#include <__exception/exception.h> -#include <__functional/hash.h> -#include 
<__functional/invoke.h> -#include <__functional/operations.h> -#include <__functional/unary_function.h> -#include <__memory/addressof.h> -#include <__memory/construct_at.h> -#include <__tuple/find_index.h> -#include <__tuple/sfinae_helpers.h> -#include <__type_traits/add_const.h> -#include <__type_traits/add_cv.h> -#include <__type_traits/add_pointer.h> -#include <__type_traits/add_volatile.h> -#include <__type_traits/common_type.h> -#include <__type_traits/conjunction.h> -#include <__type_traits/dependent_type.h> -#include <__type_traits/is_array.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_destructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_reference.h> -#include <__type_traits/is_trivially_assignable.h> -#include <__type_traits/is_trivially_constructible.h> -#include <__type_traits/is_trivially_destructible.h> -#include <__type_traits/is_trivially_relocatable.h> -#include <__type_traits/is_void.h> -#include <__type_traits/remove_const.h> -#include <__type_traits/remove_cvref.h> -#include <__type_traits/type_identity.h> -#include <__type_traits/void_t.h> -#include <__utility/declval.h> -#include <__utility/forward.h> -#include <__utility/forward_like.h> -#include <__utility/in_place.h> -#include <__utility/integer_sequence.h> -#include <__utility/move.h> -#include <__utility/swap.h> -#include <__variant/monostate.h> -#include <__verbose_abort> -#include -#include -#include -#include +#include <__cxx03/__compare/common_comparison_category.h> +#include <__cxx03/__compare/compare_three_way_result.h> +#include <__cxx03/__compare/three_way_comparable.h> +#include <__cxx03/__config> +#include <__cxx03/__exception/exception.h> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/invoke.h> +#include <__cxx03/__functional/operations.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__memory/addressof.h> 
+#include <__cxx03/__memory/construct_at.h> +#include <__cxx03/__tuple/find_index.h> +#include <__cxx03/__tuple/sfinae_helpers.h> +#include <__cxx03/__type_traits/add_const.h> +#include <__cxx03/__type_traits/add_cv.h> +#include <__cxx03/__type_traits/add_pointer.h> +#include <__cxx03/__type_traits/add_volatile.h> +#include <__cxx03/__type_traits/common_type.h> +#include <__cxx03/__type_traits/conjunction.h> +#include <__cxx03/__type_traits/dependent_type.h> +#include <__cxx03/__type_traits/is_array.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_destructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/is_nothrow_constructible.h> +#include <__cxx03/__type_traits/is_reference.h> +#include <__cxx03/__type_traits/is_trivially_assignable.h> +#include <__cxx03/__type_traits/is_trivially_constructible.h> +#include <__cxx03/__type_traits/is_trivially_destructible.h> +#include <__cxx03/__type_traits/is_trivially_relocatable.h> +#include <__cxx03/__type_traits/is_void.h> +#include <__cxx03/__type_traits/remove_const.h> +#include <__cxx03/__type_traits/remove_cvref.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__type_traits/void_t.h> +#include <__cxx03/__utility/declval.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/forward_like.h> +#include <__cxx03/__utility/in_place.h> +#include <__cxx03/__utility/integer_sequence.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/__variant/monostate.h> +#include <__cxx03/__verbose_abort> +#include <__cxx03/initializer_list> +#include <__cxx03/limits> +#include <__cxx03/new> +#include <__cxx03/version> // standard-mandated includes // [variant.syn] -#include +#include <__cxx03/compare> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> 
namespace std { // explicitly not using versioning namespace @@ -1640,11 +1640,11 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/exception> +# include <__cxx03/tuple> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_VARIANT diff --git a/libcxx/include/__cxx03/vector b/libcxx/include/__cxx03/vector index 0e4804a69ff8f..6e01ece06ab97 100644 --- a/libcxx/include/__cxx03/vector +++ b/libcxx/include/__cxx03/vector @@ -305,83 +305,83 @@ template requires is-vector-bool-reference // Since C++ // clang-format on -#include <__algorithm/copy.h> -#include <__algorithm/equal.h> -#include <__algorithm/fill_n.h> -#include <__algorithm/iterator_operations.h> -#include <__algorithm/lexicographical_compare.h> -#include <__algorithm/lexicographical_compare_three_way.h> -#include <__algorithm/remove.h> -#include <__algorithm/remove_if.h> -#include <__algorithm/rotate.h> -#include <__algorithm/unwrap_iter.h> -#include <__assert> -#include <__bit_reference> -#include <__concepts/same_as.h> -#include <__config> -#include <__debug_utils/sanitizers.h> -#include <__format/enable_insertable.h> -#include <__format/formatter.h> -#include <__format/formatter_bool.h> -#include <__functional/hash.h> -#include <__functional/unary_function.h> -#include <__fwd/vector.h> -#include <__iterator/advance.h> -#include <__iterator/bounded_iter.h> -#include <__iterator/distance.h> -#include <__iterator/iterator_traits.h> -#include <__iterator/reverse_iterator.h> -#include <__iterator/wrap_iter.h> -#include <__memory/addressof.h> -#include <__memory/allocate_at_least.h> -#include <__memory/allocator_traits.h> -#include <__memory/pointer_traits.h> -#include <__memory/swap_allocator.h> -#include <__memory/temp_value.h> -#include <__memory/uninitialized_algorithms.h> -#include 
<__memory_resource/polymorphic_allocator.h> -#include <__ranges/access.h> -#include <__ranges/concepts.h> -#include <__ranges/container_compatible_range.h> -#include <__ranges/from_range.h> -#include <__ranges/size.h> -#include <__split_buffer> -#include <__type_traits/is_allocator.h> -#include <__type_traits/is_constructible.h> -#include <__type_traits/is_nothrow_assignable.h> -#include <__type_traits/noexcept_move_assign_container.h> -#include <__type_traits/type_identity.h> -#include <__utility/exception_guard.h> -#include <__utility/forward.h> -#include <__utility/is_pointer_in_range.h> -#include <__utility/move.h> -#include <__utility/pair.h> -#include <__utility/swap.h> -#include -#include -#include -#include -#include +#include <__cxx03/__algorithm/copy.h> +#include <__cxx03/__algorithm/equal.h> +#include <__cxx03/__algorithm/fill_n.h> +#include <__cxx03/__algorithm/iterator_operations.h> +#include <__cxx03/__algorithm/lexicographical_compare.h> +#include <__cxx03/__algorithm/lexicographical_compare_three_way.h> +#include <__cxx03/__algorithm/remove.h> +#include <__cxx03/__algorithm/remove_if.h> +#include <__cxx03/__algorithm/rotate.h> +#include <__cxx03/__algorithm/unwrap_iter.h> +#include <__cxx03/__assert> +#include <__cxx03/__bit_reference> +#include <__cxx03/__concepts/same_as.h> +#include <__cxx03/__config> +#include <__cxx03/__debug_utils/sanitizers.h> +#include <__cxx03/__format/enable_insertable.h> +#include <__cxx03/__format/formatter.h> +#include <__cxx03/__format/formatter_bool.h> +#include <__cxx03/__functional/hash.h> +#include <__cxx03/__functional/unary_function.h> +#include <__cxx03/__fwd/vector.h> +#include <__cxx03/__iterator/advance.h> +#include <__cxx03/__iterator/bounded_iter.h> +#include <__cxx03/__iterator/distance.h> +#include <__cxx03/__iterator/iterator_traits.h> +#include <__cxx03/__iterator/reverse_iterator.h> +#include <__cxx03/__iterator/wrap_iter.h> +#include <__cxx03/__memory/addressof.h> +#include 
<__cxx03/__memory/allocate_at_least.h> +#include <__cxx03/__memory/allocator_traits.h> +#include <__cxx03/__memory/pointer_traits.h> +#include <__cxx03/__memory/swap_allocator.h> +#include <__cxx03/__memory/temp_value.h> +#include <__cxx03/__memory/uninitialized_algorithms.h> +#include <__cxx03/__memory_resource/polymorphic_allocator.h> +#include <__cxx03/__ranges/access.h> +#include <__cxx03/__ranges/concepts.h> +#include <__cxx03/__ranges/container_compatible_range.h> +#include <__cxx03/__ranges/from_range.h> +#include <__cxx03/__ranges/size.h> +#include <__cxx03/__split_buffer> +#include <__cxx03/__type_traits/is_allocator.h> +#include <__cxx03/__type_traits/is_constructible.h> +#include <__cxx03/__type_traits/is_nothrow_assignable.h> +#include <__cxx03/__type_traits/noexcept_move_assign_container.h> +#include <__cxx03/__type_traits/type_identity.h> +#include <__cxx03/__utility/exception_guard.h> +#include <__cxx03/__utility/forward.h> +#include <__cxx03/__utility/is_pointer_in_range.h> +#include <__cxx03/__utility/move.h> +#include <__cxx03/__utility/pair.h> +#include <__cxx03/__utility/swap.h> +#include <__cxx03/climits> +#include <__cxx03/cstring> +#include <__cxx03/limits> +#include <__cxx03/stdexcept> +#include <__cxx03/version> // standard-mandated includes // [iterator.range] -#include <__iterator/access.h> -#include <__iterator/data.h> -#include <__iterator/empty.h> -#include <__iterator/reverse_access.h> -#include <__iterator/size.h> +#include <__cxx03/__iterator/access.h> +#include <__cxx03/__iterator/data.h> +#include <__cxx03/__iterator/empty.h> +#include <__cxx03/__iterator/reverse_access.h> +#include <__cxx03/__iterator/size.h> // [vector.syn] -#include -#include +#include <__cxx03/compare> +#include <__cxx03/initializer_list> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif _LIBCPP_PUSH_MACROS -#include <__undef_macros> +#include <__cxx03/__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD @@ -3013,18 +3013,18 @@ 
_LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 -# include -# include -# include -# include -# include +# include <__cxx03/algorithm> +# include <__cxx03/atomic> +# include <__cxx03/concepts> +# include <__cxx03/cstdlib> +# include <__cxx03/iosfwd> # if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include +# include <__cxx03/locale> # endif -# include -# include -# include -# include +# include <__cxx03/tuple> +# include <__cxx03/type_traits> +# include <__cxx03/typeinfo> +# include <__cxx03/utility> #endif #endif // _LIBCPP_VECTOR diff --git a/libcxx/include/__cxx03/version b/libcxx/include/__cxx03/version index fe64343eafbc9..dd0fe4b4d2817 100644 --- a/libcxx/include/__cxx03/version +++ b/libcxx/include/__cxx03/version @@ -264,7 +264,7 @@ __cpp_lib_void_t 201411L */ -#include <__config> +#include <__cxx03/__config> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__cxx03/wchar.h b/libcxx/include/__cxx03/wchar.h index d4268c6d2c244..131948eb471d0 100644 --- a/libcxx/include/__cxx03/wchar.h +++ b/libcxx/include/__cxx03/wchar.h @@ -105,8 +105,8 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, */ -# include <__config> -# include +# include <__cxx03/__config> +# include <__cxx03/stddef.h> # if defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) # error \ @@ -126,7 +126,7 @@ size_t wcsrtombs(char* restrict dst, const wchar_t** restrict src, size_t len, # if __has_include_next() # include_next # else -# include <__mbstate_t.h> // make sure we have mbstate_t regardless of the existence of +# include <__cxx03/__mbstate_t.h> // make sure we have mbstate_t regardless of the existence of # endif // Determine whether we have const-correct overloads for wcschr and friends. 
diff --git a/libcxx/include/__cxx03/wctype.h b/libcxx/include/__cxx03/wctype.h index c76ec5a3f0608..def009c58f015 100644 --- a/libcxx/include/__cxx03/wctype.h +++ b/libcxx/include/__cxx03/wctype.h @@ -44,7 +44,7 @@ wctrans_t wctrans(const char* property); */ -#include <__config> +#include <__cxx03/__config> #if defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) # error \ @@ -59,7 +59,7 @@ wctrans_t wctrans(const char* property); // In the future, we should unconditionally include_next here and instead // have a mode under which the library does not need libc++'s or // at all (i.e. a mode without wchar_t). As it stands, we need to do that to completely -// bypass the using declarations in when we did not include . +// bypass the using declarations in when we did not include <__cxx03/wctype.h>. // Otherwise, a using declaration like `using ::wint_t` in will refer to // nothing (with using_if_exists), and if we include another header that defines one // of these declarations (e.g. ), the second `using ::wint_t` with using_if_exists diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 2cb250dfd42da..32c56fc5cc126 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -14,7 +14,6 @@ #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> -#include #include #include #include diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index cc20b038c871b..d560b6bbc35a7 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -22,7 +22,6 @@ #include <__type_traits/is_final.h> #include <__type_traits/is_polymorphic.h> #include <__utility/forward.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__exception/operations.h b/libcxx/include/__exception/operations.h index 
c8744eb297a4e..15520c558a0b4 100644 --- a/libcxx/include/__exception/operations.h +++ b/libcxx/include/__exception/operations.h @@ -10,7 +10,6 @@ #define _LIBCPP___EXCEPTION_OPERATIONS_H #include <__config> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h index e0246d8001e19..621e9bf9258ef 100644 --- a/libcxx/include/__filesystem/directory_iterator.h +++ b/libcxx/include/__filesystem/directory_iterator.h @@ -22,7 +22,6 @@ #include <__ranges/enable_view.h> #include <__system_error/error_code.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index 2eb60810af02b..88c800fdf86d0 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -22,7 +22,6 @@ #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> #include <__utility/move.h> -#include #include #include diff --git a/libcxx/include/__filesystem/path_iterator.h b/libcxx/include/__filesystem/path_iterator.h index f4d486d86cf38..e0f601662d462 100644 --- a/libcxx/include/__filesystem/path_iterator.h +++ b/libcxx/include/__filesystem/path_iterator.h @@ -14,9 +14,6 @@ #include <__config> #include <__filesystem/path.h> #include <__iterator/iterator_traits.h> -#include -#include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h index caa1396eb301f..1be92a8bd5f49 100644 --- a/libcxx/include/__filesystem/recursive_directory_iterator.h +++ b/libcxx/include/__filesystem/recursive_directory_iterator.h @@ -21,7 +21,6 @@ #include <__ranges/enable_view.h> #include <__system_error/error_code.h> #include 
<__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index 9ca32d5295bd2..5c14c0ac693b0 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -23,6 +23,7 @@ #include <__concepts/convertible_to.h> #include <__concepts/swappable.h> #include <__config> +#include <__cstddef/byte.h> #include <__flat_map/sorted_unique.h> #include <__functional/invoke.h> #include <__functional/is_transparent.h> diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h index ce9ac0c81e315..618b8ef025643 100644 --- a/libcxx/include/__format/buffer.h +++ b/libcxx/include/__format/buffer.h @@ -37,7 +37,6 @@ #include <__type_traits/conditional.h> #include <__utility/exception_guard.h> #include <__utility/move.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h index bdf86cb6f99cc..7a0b35239861e 100644 --- a/libcxx/include/__format/escaped_output_table.h +++ b/libcxx/include/__format/escaped_output_table.h @@ -63,7 +63,7 @@ #include <__algorithm/ranges_upper_bound.h> #include <__config> -#include +#include <__cstddef/ptrdiff_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/extended_grapheme_cluster_table.h b/libcxx/include/__format/extended_grapheme_cluster_table.h index 7dbc239f5f5cd..7653a9e03b815 100644 --- a/libcxx/include/__format/extended_grapheme_cluster_table.h +++ b/libcxx/include/__format/extended_grapheme_cluster_table.h @@ -63,8 +63,8 @@ #include <__algorithm/ranges_upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/access.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/format_arg.h 
b/libcxx/include/__format/format_arg.h index 3f39d2aec81f7..a973ccd43c420 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -13,6 +13,7 @@ #include <__assert> #include <__concepts/arithmetic.h> #include <__config> +#include <__cstddef/size_t.h> #include <__format/concepts.h> #include <__format/format_parse_context.h> #include <__functional/invoke.h> diff --git a/libcxx/include/__format/format_args.h b/libcxx/include/__format/format_args.h index e19b4458e41a5..b98663c06ea4d 100644 --- a/libcxx/include/__format/format_args.h +++ b/libcxx/include/__format/format_args.h @@ -11,10 +11,10 @@ #define _LIBCPP___FORMAT_FORMAT_ARGS_H #include <__config> +#include <__cstddef/size_t.h> #include <__format/format_arg.h> #include <__format/format_arg_store.h> #include <__fwd/format.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 019a71011a714..ecbf62770cdea 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -23,7 +23,6 @@ #include <__memory/addressof.h> #include <__utility/move.h> #include <__variant/monostate.h> -#include #ifndef _LIBCPP_HAS_NO_LOCALIZATION # include <__locale> diff --git a/libcxx/include/__format/format_string.h b/libcxx/include/__format/format_string.h index a499afee8874a..5db5973dd5889 100644 --- a/libcxx/include/__format/format_string.h +++ b/libcxx/include/__format/format_string.h @@ -12,10 +12,10 @@ #include <__assert> #include <__config> +#include <__cstddef/size_t.h> #include <__format/format_error.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> // iter_value_t -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h index fc95dd3f22bbe..90720a5098f0f 100644 --- 
a/libcxx/include/__format/formatter_floating_point.h +++ b/libcxx/include/__format/formatter_floating_point.h @@ -23,6 +23,7 @@ #include <__concepts/arithmetic.h> #include <__concepts/same_as.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__format/concepts.h> #include <__format/format_parse_context.h> #include <__format/formatter.h> @@ -36,7 +37,6 @@ #include <__utility/move.h> #include <__utility/unreachable.h> #include -#include #ifndef _LIBCPP_HAS_NO_LOCALIZATION # include <__locale> diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index 34c4c87313a45..457f5f53b2dc5 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -16,6 +16,8 @@ #include <__bit/countl.h> #include <__concepts/same_as.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__format/buffer.h> #include <__format/concepts.h> #include <__format/formatter.h> @@ -28,7 +30,6 @@ #include <__memory/pointer_traits.h> #include <__utility/move.h> #include <__utility/unreachable.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h index 6e0fa9a1b4f19..4ef48c168d0d8 100644 --- a/libcxx/include/__format/formatter_pointer.h +++ b/libcxx/include/__format/formatter_pointer.h @@ -11,13 +11,13 @@ #define _LIBCPP___FORMAT_FORMATTER_POINTER_H #include <__config> +#include <__cstddef/nullptr_t.h> #include <__format/concepts.h> #include <__format/format_parse_context.h> #include <__format/formatter.h> #include <__format/formatter_integral.h> #include <__format/formatter_output.h> #include <__format/parser_std_format_spec.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/indic_conjunct_break_table.h b/libcxx/include/__format/indic_conjunct_break_table.h index 
39dd45da771fc..df6cfe6a02f34 100644 --- a/libcxx/include/__format/indic_conjunct_break_table.h +++ b/libcxx/include/__format/indic_conjunct_break_table.h @@ -63,8 +63,8 @@ #include <__algorithm/ranges_upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/access.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__format/width_estimation_table.h b/libcxx/include/__format/width_estimation_table.h index 23a08746b9103..5b4b3950c6a1d 100644 --- a/libcxx/include/__format/width_estimation_table.h +++ b/libcxx/include/__format/width_estimation_table.h @@ -63,7 +63,7 @@ #include <__algorithm/ranges_upper_bound.h> #include <__config> -#include +#include <__cstddef/ptrdiff_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__functional/bind.h b/libcxx/include/__functional/bind.h index 4251ef74ab7bd..f82c1517249b1 100644 --- a/libcxx/include/__functional/bind.h +++ b/libcxx/include/__functional/bind.h @@ -17,7 +17,6 @@ #include <__type_traits/invoke.h> #include <__type_traits/is_reference_wrapper.h> #include <__type_traits/is_void.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h index 8abec99f72905..f7b89f759b5f5 100644 --- a/libcxx/include/__functional/hash.h +++ b/libcxx/include/__functional/hash.h @@ -20,7 +20,6 @@ #include <__type_traits/underlying_type.h> #include <__utility/pair.h> #include <__utility/swap.h> -#include #include #include diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 560e873adc384..8e4cb3c914dc4 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -15,6 +15,7 @@ #include <__assert> #include <__bit/countl.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/hash.h> #include <__iterator/iterator_traits.h> #include <__math/rounding_functions.h> 
diff --git a/libcxx/include/__iterator/access.h b/libcxx/include/__iterator/access.h index acc4f60bf697e..d42855f925487 100644 --- a/libcxx/include/__iterator/access.h +++ b/libcxx/include/__iterator/access.h @@ -11,7 +11,7 @@ #define _LIBCPP___ITERATOR_ACCESS_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h index 94ba577078b5e..aeb5b4a88ec37 100644 --- a/libcxx/include/__iterator/aliasing_iterator.h +++ b/libcxx/include/__iterator/aliasing_iterator.h @@ -10,10 +10,10 @@ #define _LIBCPP___ITERATOR_ALIASING_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/iterator_traits.h> #include <__memory/pointer_traits.h> #include <__type_traits/is_trivial.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/back_insert_iterator.h b/libcxx/include/__iterator/back_insert_iterator.h index 6d3dd4b12966f..9a59487533885 100644 --- a/libcxx/include/__iterator/back_insert_iterator.h +++ b/libcxx/include/__iterator/back_insert_iterator.h @@ -11,11 +11,11 @@ #define _LIBCPP___ITERATOR_BACK_INSERT_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/data.h b/libcxx/include/__iterator/data.h index b7c1603652b0e..5f2624c2b819e 100644 --- a/libcxx/include/__iterator/data.h +++ b/libcxx/include/__iterator/data.h @@ -11,7 +11,6 @@ #define _LIBCPP___ITERATOR_DATA_H #include <__config> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__iterator/empty.h 
b/libcxx/include/__iterator/empty.h index 773f2776955b2..f2c653bcb329b 100644 --- a/libcxx/include/__iterator/empty.h +++ b/libcxx/include/__iterator/empty.h @@ -11,7 +11,6 @@ #define _LIBCPP___ITERATOR_EMPTY_H #include <__config> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__iterator/front_insert_iterator.h b/libcxx/include/__iterator/front_insert_iterator.h index 7f2c54ec87442..80819cd22ae6c 100644 --- a/libcxx/include/__iterator/front_insert_iterator.h +++ b/libcxx/include/__iterator/front_insert_iterator.h @@ -11,11 +11,11 @@ #define _LIBCPP___ITERATOR_FRONT_INSERT_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/incrementable_traits.h b/libcxx/include/__iterator/incrementable_traits.h index a228b228f6e55..37c8daddf8a86 100644 --- a/libcxx/include/__iterator/incrementable_traits.h +++ b/libcxx/include/__iterator/incrementable_traits.h @@ -12,13 +12,13 @@ #include <__concepts/arithmetic.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__type_traits/conditional.h> #include <__type_traits/is_object.h> #include <__type_traits/is_primary_template.h> #include <__type_traits/make_signed.h> #include <__type_traits/remove_cvref.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/insert_iterator.h b/libcxx/include/__iterator/insert_iterator.h index 8b7574dc9ec0a..b3311042014f8 100644 --- a/libcxx/include/__iterator/insert_iterator.h +++ b/libcxx/include/__iterator/insert_iterator.h @@ -11,12 +11,12 @@ #define _LIBCPP___ITERATOR_INSERT_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include 
<__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__ranges/access.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/istream_iterator.h b/libcxx/include/__iterator/istream_iterator.h index 58c9ac6d4ccce..a6c74d00178d2 100644 --- a/libcxx/include/__iterator/istream_iterator.h +++ b/libcxx/include/__iterator/istream_iterator.h @@ -11,13 +11,13 @@ #define _LIBCPP___ITERATOR_ISTREAM_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/istream.h> #include <__fwd/string.h> #include <__iterator/default_sentinel.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/istreambuf_iterator.h b/libcxx/include/__iterator/istreambuf_iterator.h index 51c4ecff351f5..162873b9559ec 100644 --- a/libcxx/include/__iterator/istreambuf_iterator.h +++ b/libcxx/include/__iterator/istreambuf_iterator.h @@ -16,6 +16,8 @@ #include <__iterator/default_sentinel.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> +#include <__string/char_traits.h> +#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/iterator.h b/libcxx/include/__iterator/iterator.h index ba9308f3c2243..1591655313dde 100644 --- a/libcxx/include/__iterator/iterator.h +++ b/libcxx/include/__iterator/iterator.h @@ -11,7 +11,7 @@ #define _LIBCPP___ITERATOR_ITERATOR_H #include <__config> -#include +#include <__cstddef/ptrdiff_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h index 4d9ad480cc4a2..eb6ba8b62fb30 100644 --- 
a/libcxx/include/__iterator/iterator_traits.h +++ b/libcxx/include/__iterator/iterator_traits.h @@ -18,6 +18,7 @@ #include <__concepts/same_as.h> #include <__concepts/totally_ordered.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/pair.h> #include <__iterator/incrementable_traits.h> #include <__iterator/readable_traits.h> @@ -36,7 +37,6 @@ #include <__type_traits/remove_cvref.h> #include <__type_traits/void_t.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/ostream_iterator.h b/libcxx/include/__iterator/ostream_iterator.h index 05697e62d9dcb..93ecc03010d07 100644 --- a/libcxx/include/__iterator/ostream_iterator.h +++ b/libcxx/include/__iterator/ostream_iterator.h @@ -11,12 +11,12 @@ #define _LIBCPP___ITERATOR_OSTREAM_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/ostream.h> #include <__fwd/string.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/ostreambuf_iterator.h b/libcxx/include/__iterator/ostreambuf_iterator.h index 401b6f3f23603..621ffd4f988c3 100644 --- a/libcxx/include/__iterator/ostreambuf_iterator.h +++ b/libcxx/include/__iterator/ostreambuf_iterator.h @@ -11,12 +11,12 @@ #define _LIBCPP___ITERATOR_OSTREAMBUF_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/ios.h> #include <__fwd/ostream.h> #include <__fwd/streambuf.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> -#include #include // for forward declaration of ostreambuf_iterator #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__iterator/reverse_access.h b/libcxx/include/__iterator/reverse_access.h index 54d7270b04a53..f6e60c3fb75b3 100644 --- 
a/libcxx/include/__iterator/reverse_access.h +++ b/libcxx/include/__iterator/reverse_access.h @@ -12,7 +12,6 @@ #include <__config> #include <__iterator/reverse_iterator.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__iterator/segmented_iterator.h b/libcxx/include/__iterator/segmented_iterator.h index f3cd1e5fa1f5d..8cb54a35a7f5e 100644 --- a/libcxx/include/__iterator/segmented_iterator.h +++ b/libcxx/include/__iterator/segmented_iterator.h @@ -41,8 +41,8 @@ // Returns the iterator composed of the segment iterator and local iterator. #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/size.h b/libcxx/include/__iterator/size.h index 876e6963f77d9..84e2e3b21f1d5 100644 --- a/libcxx/include/__iterator/size.h +++ b/libcxx/include/__iterator/size.h @@ -11,9 +11,10 @@ #define _LIBCPP___ITERATOR_SIZE_H #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__type_traits/common_type.h> #include <__type_traits/make_signed.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index 549d8ff2dbd7d..2856833e60079 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -13,13 +13,13 @@ #include <__compare/ordering.h> #include <__compare/three_way_comparable.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__memory/pointer_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__mdspan/default_accessor.h b/libcxx/include/__mdspan/default_accessor.h index 1cc5f15545fc8..d6f3ddb998e96 100644 --- a/libcxx/include/__mdspan/default_accessor.h +++ b/libcxx/include/__mdspan/default_accessor.h @@ -18,12 +18,11 @@ #define _LIBCPP___MDSPAN_DEFAULT_ACCESSOR_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_abstract.h> #include <__type_traits/is_array.h> #include <__type_traits/is_convertible.h> #include <__type_traits/remove_const.h> -#include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index 3d2c2771a834b..edbc30a7a40e4 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -21,6 +21,7 @@ #include <__config> #include <__concepts/arithmetic.h> +#include <__cstddef/byte.h> #include <__type_traits/common_type.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> @@ -29,9 +30,7 @@ #include <__utility/integer_sequence.h> #include <__utility/unreachable.h> #include -#include #include -#include #include #include diff --git a/libcxx/include/__mdspan/layout_left.h b/libcxx/include/__mdspan/layout_left.h index 59574e83b0d7b..288b3dd8038ee 100644 --- a/libcxx/include/__mdspan/layout_left.h +++ b/libcxx/include/__mdspan/layout_left.h @@ -27,9 +27,6 @@ #include <__type_traits/is_nothrow_constructible.h> #include <__utility/integer_sequence.h> #include -#include -#include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__mdspan/layout_right.h b/libcxx/include/__mdspan/layout_right.h index d1acdb41238f7..72922d1049c7a 100644 --- a/libcxx/include/__mdspan/layout_right.h +++ b/libcxx/include/__mdspan/layout_right.h @@ -19,6 +19,7 @@ #include <__assert> #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/mdspan.h> #include 
<__mdspan/extents.h> #include <__type_traits/common_type.h> @@ -26,9 +27,6 @@ #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> #include <__utility/integer_sequence.h> -#include -#include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__mdspan/layout_stride.h b/libcxx/include/__mdspan/layout_stride.h index c57f596431c7c..bb93de9775145 100644 --- a/libcxx/include/__mdspan/layout_stride.h +++ b/libcxx/include/__mdspan/layout_stride.h @@ -32,8 +32,6 @@ #include <__utility/integer_sequence.h> #include <__utility/swap.h> #include -#include -#include #include #include diff --git a/libcxx/include/__mdspan/mdspan.h b/libcxx/include/__mdspan/mdspan.h index 1ff4fd4ba4a82..3f9b35b185b16 100644 --- a/libcxx/include/__mdspan/mdspan.h +++ b/libcxx/include/__mdspan/mdspan.h @@ -37,9 +37,6 @@ #include <__type_traits/remove_reference.h> #include <__utility/integer_sequence.h> #include -#include -#include -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/align.h b/libcxx/include/__memory/align.h index bbb995f4a8c8e..402eac3380925 100644 --- a/libcxx/include/__memory/align.h +++ b/libcxx/include/__memory/align.h @@ -10,7 +10,7 @@ #define _LIBCPP___MEMORY_ALIGN_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/aligned_alloc.h b/libcxx/include/__memory/aligned_alloc.h index 33fe8af77df7b..fb36983d9c3dc 100644 --- a/libcxx/include/__memory/aligned_alloc.h +++ b/libcxx/include/__memory/aligned_alloc.h @@ -10,7 +10,6 @@ #define _LIBCPP___MEMORY_ALIGNED_ALLOC_H #include <__config> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/allocate_at_least.h b/libcxx/include/__memory/allocate_at_least.h index a10e4fbaead38..9b5a8bcbd4596 100644 --- 
a/libcxx/include/__memory/allocate_at_least.h +++ b/libcxx/include/__memory/allocate_at_least.h @@ -10,8 +10,8 @@ #define _LIBCPP___MEMORY_ALLOCATE_AT_LEAST_H #include <__config> +#include <__cstddef/size_t.h> #include <__memory/allocator_traits.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h index cb870af7be676..66d6a5002c299 100644 --- a/libcxx/include/__memory/allocation_guard.h +++ b/libcxx/include/__memory/allocation_guard.h @@ -14,7 +14,6 @@ #include <__memory/addressof.h> #include <__memory/allocator_traits.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index cd146da8e7eb5..ddb4179940b8b 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_ALLOCATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> #include <__memory/allocator_traits.h> @@ -20,7 +21,6 @@ #include <__type_traits/is_void.h> #include <__type_traits/is_volatile.h> #include <__utility/forward.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/allocator_traits.h b/libcxx/include/__memory/allocator_traits.h index f4d9679807ae0..499b30b85b6c9 100644 --- a/libcxx/include/__memory/allocator_traits.h +++ b/libcxx/include/__memory/allocator_traits.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/memory.h> #include <__memory/construct_at.h> #include <__memory/pointer_traits.h> @@ -23,7 +24,6 @@ #include <__type_traits/void_t.h> #include <__utility/declval.h> #include <__utility/forward.h> -#include #include #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/array_cookie.h b/libcxx/include/__memory/array_cookie.h index 10b29c9dcc78e..806a9e99ecafe 100644 --- a/libcxx/include/__memory/array_cookie.h +++ b/libcxx/include/__memory/array_cookie.h @@ -12,10 +12,10 @@ #include <__config> #include <__configuration/abi.h> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/negation.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/assume_aligned.h b/libcxx/include/__memory/assume_aligned.h index c7ba2a99c7e22..08f1772cd6dfa 100644 --- a/libcxx/include/__memory/assume_aligned.h +++ b/libcxx/include/__memory/assume_aligned.h @@ -12,8 +12,8 @@ #include <__assert> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_constant_evaluated.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/builtin_new_allocator.h b/libcxx/include/__memory/builtin_new_allocator.h index c6f7f3c5ff52a..128288efb05bc 100644 --- a/libcxx/include/__memory/builtin_new_allocator.h +++ b/libcxx/include/__memory/builtin_new_allocator.h @@ -11,7 +11,6 @@ #include <__config> #include <__memory/unique_ptr.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h index 9d44775bdb487..a7acaaff9da09 100644 --- a/libcxx/include/__memory/compressed_pair.h +++ b/libcxx/include/__memory/compressed_pair.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_COMPRESSED_PAIR_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/datasizeof.h> #include <__type_traits/is_empty.h> #include <__type_traits/is_final.h> diff --git a/libcxx/include/__memory/destruct_n.h b/libcxx/include/__memory/destruct_n.h index 
78635ad0af04b..66adefb0f51fc 100644 --- a/libcxx/include/__memory/destruct_n.h +++ b/libcxx/include/__memory/destruct_n.h @@ -10,9 +10,9 @@ #define _LIBCPP___MEMORY_DESTRUCT_N_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_trivially_destructible.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h index 98961ddf9709f..4acf3d18401ae 100644 --- a/libcxx/include/__memory/pointer_traits.h +++ b/libcxx/include/__memory/pointer_traits.h @@ -11,6 +11,7 @@ #define _LIBCPP___MEMORY_POINTER_TRAITS_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__memory/addressof.h> #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> @@ -23,7 +24,6 @@ #include <__type_traits/void_t.h> #include <__utility/declval.h> #include <__utility/forward.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/raw_storage_iterator.h b/libcxx/include/__memory/raw_storage_iterator.h index 774878aa1c5e8..2ee4c074d8d33 100644 --- a/libcxx/include/__memory/raw_storage_iterator.h +++ b/libcxx/include/__memory/raw_storage_iterator.h @@ -11,11 +11,11 @@ #define _LIBCPP___MEMORY_RAW_STORAGE_ITERATOR_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__utility/move.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 65870ba574c25..e5adbedce1a2d 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -13,6 +13,7 @@ #include <__compare/compare_three_way.h> #include <__compare/ordering.h> #include <__config> +#include 
<__cstddef/ptrdiff_t.h> #include <__exception/exception.h> #include <__functional/binary_function.h> #include <__functional/operations.h> @@ -51,7 +52,6 @@ #include <__utility/move.h> #include <__utility/swap.h> #include <__verbose_abort> -#include #include #include #if _LIBCPP_HAS_ATOMIC_HEADER diff --git a/libcxx/include/__memory/temporary_buffer.h b/libcxx/include/__memory/temporary_buffer.h index 219e03f99bc01..d18717f52d1cd 100644 --- a/libcxx/include/__memory/temporary_buffer.h +++ b/libcxx/include/__memory/temporary_buffer.h @@ -11,10 +11,9 @@ #define _LIBCPP___MEMORY_TEMPORARY_BUFFER_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__memory/unique_temporary_buffer.h> #include <__utility/pair.h> -#include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index 6e42ef1eaa1a3..4ed6393b1209f 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -15,6 +15,8 @@ #include <__compare/compare_three_way_result.h> #include <__compare/three_way_comparable.h> #include <__config> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__functional/hash.h> #include <__functional/operations.h> #include <__memory/allocator_traits.h> // __pointer @@ -46,7 +48,6 @@ #include <__utility/forward.h> #include <__utility/move.h> #include <__utility/private_constructor_tag.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h index 8a8c327be69fe..4f47c84e2f8db 100644 --- a/libcxx/include/__memory/unique_temporary_buffer.h +++ b/libcxx/include/__memory/unique_temporary_buffer.h @@ -13,10 +13,10 @@ #include <__assert> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__memory/allocator.h> #include <__memory/unique_ptr.h> #include 
<__type_traits/is_constant_evaluated.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory/uses_allocator.h b/libcxx/include/__memory/uses_allocator.h index 16504e8b2a998..20aa0e6b7f272 100644 --- a/libcxx/include/__memory/uses_allocator.h +++ b/libcxx/include/__memory/uses_allocator.h @@ -13,7 +13,6 @@ #include <__config> #include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory_resource/memory_resource.h b/libcxx/include/__memory_resource/memory_resource.h index ea85e50cd568b..f93f10fe21a2d 100644 --- a/libcxx/include/__memory_resource/memory_resource.h +++ b/libcxx/include/__memory_resource/memory_resource.h @@ -10,8 +10,9 @@ #define _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H #include <__config> +#include <__cstddef/max_align_t.h> +#include <__cstddef/size_t.h> #include <__fwd/memory_resource.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__memory_resource/monotonic_buffer_resource.h index f45b30fdb3861..c5a2b556707f6 100644 --- a/libcxx/include/__memory_resource/monotonic_buffer_resource.h +++ b/libcxx/include/__memory_resource/monotonic_buffer_resource.h @@ -10,9 +10,9 @@ #define _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H #include <__config> +#include <__cstddef/size_t.h> #include <__memory/addressof.h> #include <__memory_resource/memory_resource.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h index fb36d5cad78ec..30fa5c2170d50 100644 --- a/libcxx/include/__memory_resource/polymorphic_allocator.h +++ 
b/libcxx/include/__memory_resource/polymorphic_allocator.h @@ -11,10 +11,11 @@ #include <__assert> #include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/max_align_t.h> #include <__fwd/pair.h> #include <__memory_resource/memory_resource.h> #include <__utility/exception_guard.h> -#include #include #include #include diff --git a/libcxx/include/__memory_resource/pool_options.h b/libcxx/include/__memory_resource/pool_options.h index 442959836c7ef..324b8aaa8502b 100644 --- a/libcxx/include/__memory_resource/pool_options.h +++ b/libcxx/include/__memory_resource/pool_options.h @@ -10,7 +10,7 @@ #define _LIBCPP___MEMORY_RESOURCE_POOL_OPTIONS_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__memory_resource/synchronized_pool_resource.h index 50a673c2861d1..2679afc16617b 100644 --- a/libcxx/include/__memory_resource/synchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/synchronized_pool_resource.h @@ -13,7 +13,6 @@ #include <__memory_resource/memory_resource.h> #include <__memory_resource/pool_options.h> #include <__memory_resource/unsynchronized_pool_resource.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h index 783db84262af7..92da16c559fea 100644 --- a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h +++ b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h @@ -10,9 +10,9 @@ #define _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H #include <__config> +#include <__cstddef/size_t.h> #include <__memory_resource/memory_resource.h> #include <__memory_resource/pool_options.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git 
a/libcxx/include/__numeric/midpoint.h b/libcxx/include/__numeric/midpoint.h index 5ef30d4ec50f5..2ba80e5cca07d 100644 --- a/libcxx/include/__numeric/midpoint.h +++ b/libcxx/include/__numeric/midpoint.h @@ -11,6 +11,7 @@ #define _LIBCPP___NUMERIC_MIDPOINT_H #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_floating_point.h> #include <__type_traits/is_integral.h> @@ -21,7 +22,6 @@ #include <__type_traits/is_void.h> #include <__type_traits/make_unsigned.h> #include <__type_traits/remove_pointer.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__ostream/basic_ostream.h b/libcxx/include/__ostream/basic_ostream.h index 1b1c026706cdf..fc72247725b49 100644 --- a/libcxx/include/__ostream/basic_ostream.h +++ b/libcxx/include/__ostream/basic_ostream.h @@ -23,7 +23,6 @@ # include <__type_traits/void_t.h> # include <__utility/declval.h> # include -# include # include # include # include // for __throw_bad_alloc diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h index a92d0978e5c64..701367b505c8b 100644 --- a/libcxx/include/__pstl/backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -16,6 +16,7 @@ #include <__algorithm/upper_bound.h> #include <__atomic/atomic.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__exception/terminate.h> #include <__iterator/iterator_traits.h> #include <__iterator/move_iterator.h> @@ -37,7 +38,6 @@ #include <__utility/exception_guard.h> #include <__utility/move.h> #include <__utility/pair.h> -#include #include #include diff --git a/libcxx/include/__pstl/backends/std_thread.h b/libcxx/include/__pstl/backends/std_thread.h index 19b985f860a17..dd2c3f15403e3 100644 --- a/libcxx/include/__pstl/backends/std_thread.h +++ b/libcxx/include/__pstl/backends/std_thread.h @@ -22,7 +22,6 @@ #include <__pstl/cpu_algos/transform_reduce.h> #include 
<__utility/empty.h> #include <__utility/move.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 5e59752fa5723..ec1622419d049 100644 --- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -10,7 +10,6 @@ #define _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H #include <__config> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__pstl/cpu_algos/find_if.h b/libcxx/include/__pstl/cpu_algos/find_if.h index cd92e5a99f12f..ebb4ecb4a0ed3 100644 --- a/libcxx/include/__pstl/cpu_algos/find_if.h +++ b/libcxx/include/__pstl/cpu_algos/find_if.h @@ -21,7 +21,6 @@ #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include <__utility/pair.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__pstl/cpu_algos/transform_reduce.h index aafbf1ca96b40..e9f622d832cd5 100644 --- a/libcxx/include/__pstl/cpu_algos/transform_reduce.h +++ b/libcxx/include/__pstl/cpu_algos/transform_reduce.h @@ -20,7 +20,6 @@ #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> -#include #include #include diff --git a/libcxx/include/__random/discard_block_engine.h b/libcxx/include/__random/discard_block_engine.h index 07f599067279e..f319557a57365 100644 --- a/libcxx/include/__random/discard_block_engine.h +++ b/libcxx/include/__random/discard_block_engine.h @@ -10,11 +10,11 @@ #define _LIBCPP___RANDOM_DISCARD_BLOCK_ENGINE_H #include <__config> +#include <__cstddef/size_t.h> #include <__random/is_seed_sequence.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_convertible.h> #include <__utility/move.h> -#include #include #include diff --git 
a/libcxx/include/__random/discrete_distribution.h b/libcxx/include/__random/discrete_distribution.h index 931f7704ff976..3ce4a495fb0c3 100644 --- a/libcxx/include/__random/discrete_distribution.h +++ b/libcxx/include/__random/discrete_distribution.h @@ -14,7 +14,6 @@ #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/vector.h> -#include #include #include #include diff --git a/libcxx/include/__random/independent_bits_engine.h b/libcxx/include/__random/independent_bits_engine.h index 0f4a7b82b98f8..20f56e9b57560 100644 --- a/libcxx/include/__random/independent_bits_engine.h +++ b/libcxx/include/__random/independent_bits_engine.h @@ -10,6 +10,7 @@ #define _LIBCPP___RANDOM_INDEPENDENT_BITS_ENGINE_H #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/istream.h> #include <__fwd/ostream.h> #include <__random/is_seed_sequence.h> @@ -18,7 +19,6 @@ #include <__type_traits/enable_if.h> #include <__type_traits/is_convertible.h> #include <__utility/move.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__random/log2.h b/libcxx/include/__random/log2.h index c96a5247ff6db..fbf35bab91728 100644 --- a/libcxx/include/__random/log2.h +++ b/libcxx/include/__random/log2.h @@ -10,8 +10,8 @@ #define _LIBCPP___RANDOM_LOG2_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/conditional.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__random/mersenne_twister_engine.h b/libcxx/include/__random/mersenne_twister_engine.h index 1f50e608ce8d4..9dd87f9ce71a1 100644 --- a/libcxx/include/__random/mersenne_twister_engine.h +++ b/libcxx/include/__random/mersenne_twister_engine.h @@ -12,9 +12,9 @@ #include <__algorithm/equal.h> #include <__algorithm/min.h> #include <__config> +#include <__cstddef/size_t.h> #include <__random/is_seed_sequence.h> #include <__type_traits/enable_if.h> -#include 
#include #include #include diff --git a/libcxx/include/__random/piecewise_constant_distribution.h b/libcxx/include/__random/piecewise_constant_distribution.h index a864f848143ba..5afe2ebeda3bb 100644 --- a/libcxx/include/__random/piecewise_constant_distribution.h +++ b/libcxx/include/__random/piecewise_constant_distribution.h @@ -11,6 +11,7 @@ #include <__algorithm/upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/vector.h> diff --git a/libcxx/include/__random/piecewise_linear_distribution.h b/libcxx/include/__random/piecewise_linear_distribution.h index 24aa6cce91cf2..0d14f882cbbb2 100644 --- a/libcxx/include/__random/piecewise_linear_distribution.h +++ b/libcxx/include/__random/piecewise_linear_distribution.h @@ -11,6 +11,7 @@ #include <__algorithm/upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/comparison.h> diff --git a/libcxx/include/__random/shuffle_order_engine.h b/libcxx/include/__random/shuffle_order_engine.h index f54ed17e38383..53f6c08971105 100644 --- a/libcxx/include/__random/shuffle_order_engine.h +++ b/libcxx/include/__random/shuffle_order_engine.h @@ -11,12 +11,12 @@ #include <__algorithm/equal.h> #include <__config> +#include <__cstddef/size_t.h> #include <__random/is_seed_sequence.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> #include <__utility/move.h> -#include #include #include diff --git a/libcxx/include/__random/subtract_with_carry_engine.h b/libcxx/include/__random/subtract_with_carry_engine.h index 926333cdda45e..e087ab4a3c2c7 100644 --- a/libcxx/include/__random/subtract_with_carry_engine.h +++ b/libcxx/include/__random/subtract_with_carry_engine.h @@ -12,10 +12,10 @@ #include <__algorithm/equal.h> #include <__algorithm/min.h> 
#include <__config> +#include <__cstddef/size_t.h> #include <__random/is_seed_sequence.h> #include <__random/linear_congruential_engine.h> #include <__type_traits/enable_if.h> -#include #include #include #include diff --git a/libcxx/include/__random/uniform_int_distribution.h b/libcxx/include/__random/uniform_int_distribution.h index 4e3ca3efe5686..fa2c33755b739 100644 --- a/libcxx/include/__random/uniform_int_distribution.h +++ b/libcxx/include/__random/uniform_int_distribution.h @@ -11,11 +11,11 @@ #include <__bit/countl.h> #include <__config> +#include <__cstddef/size_t.h> #include <__random/is_valid.h> #include <__random/log2.h> #include <__type_traits/conditional.h> #include <__type_traits/make_unsigned.h> -#include #include #include #include diff --git a/libcxx/include/__ranges/access.h b/libcxx/include/__ranges/access.h index c0a40c5e10178..bbacef3eae6be 100644 --- a/libcxx/include/__ranges/access.h +++ b/libcxx/include/__ranges/access.h @@ -12,6 +12,7 @@ #include <__concepts/class_or_enum.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/concepts.h> #include <__iterator/readable_traits.h> #include <__ranges/enable_borrowed_range.h> @@ -21,7 +22,6 @@ #include <__type_traits/remove_reference.h> #include <__utility/auto_cast.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/counted.h b/libcxx/include/__ranges/counted.h index e365deca4e632..65bf1a371ec78 100644 --- a/libcxx/include/__ranges/counted.h +++ b/libcxx/include/__ranges/counted.h @@ -12,6 +12,7 @@ #include <__concepts/convertible_to.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/concepts.h> #include <__iterator/counted_iterator.h> #include <__iterator/default_sentinel.h> @@ -22,7 +23,6 @@ #include <__type_traits/decay.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #include #if 
!defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h index bd66371f4ed21..87f66f17a2abe 100644 --- a/libcxx/include/__ranges/drop_view.h +++ b/libcxx/include/__ranges/drop_view.h @@ -15,6 +15,7 @@ #include <__concepts/constructible.h> #include <__concepts/convertible_to.h> #include <__config> +#include <__cstddef/size_t.h> #include <__functional/bind_back.h> #include <__fwd/span.h> #include <__fwd/string_view.h> @@ -42,7 +43,6 @@ #include <__utility/auto_cast.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h index ac0d8dbbd52b7..c99282f37960c 100644 --- a/libcxx/include/__ranges/elements_view.h +++ b/libcxx/include/__ranges/elements_view.h @@ -37,7 +37,6 @@ #include <__utility/declval.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #include // std::get #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__ranges/empty_view.h b/libcxx/include/__ranges/empty_view.h index 6c04b0200c35f..fc08492110f53 100644 --- a/libcxx/include/__ranges/empty_view.h +++ b/libcxx/include/__ranges/empty_view.h @@ -11,10 +11,10 @@ #define _LIBCPP___RANGES_EMPTY_VIEW_H #include <__config> +#include <__cstddef/size_t.h> #include <__ranges/enable_borrowed_range.h> #include <__ranges/view_interface.h> #include <__type_traits/is_object.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/istream_view.h b/libcxx/include/__ranges/istream_view.h index cd7096d35c2c1..1820ef2a4c1f7 100644 --- a/libcxx/include/__ranges/istream_view.h +++ b/libcxx/include/__ranges/istream_view.h @@ -14,6 +14,7 @@ #include <__concepts/derived_from.h> #include <__concepts/movable.h> #include <__config> +#include 
<__cstddef/ptrdiff_t.h> #include <__fwd/istream.h> #include <__fwd/string.h> #include <__iterator/default_sentinel.h> @@ -22,7 +23,6 @@ #include <__ranges/view_interface.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h index 53e4beb270ad0..93ceaf1711d32 100644 --- a/libcxx/include/__ranges/repeat_view.h +++ b/libcxx/include/__ranges/repeat_view.h @@ -15,6 +15,7 @@ #include <__concepts/same_as.h> #include <__concepts/semiregular.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__iterator/unreachable_sentinel.h> diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index 45244f34994d7..955578b99cf58 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -12,6 +12,8 @@ #include <__concepts/constructible.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__ranges/movable_box.h> #include <__ranges/range_adaptor.h> #include <__ranges/view_interface.h> @@ -20,7 +22,6 @@ #include <__utility/forward.h> #include <__utility/in_place.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/size.h b/libcxx/include/__ranges/size.h index 40b0c6b6aad7a..5da3a6ff268e8 100644 --- a/libcxx/include/__ranges/size.h +++ b/libcxx/include/__ranges/size.h @@ -13,6 +13,8 @@ #include <__concepts/arithmetic.h> #include <__concepts/class_or_enum.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__ranges/access.h> @@ -22,7 +24,6 @@ #include 
<__type_traits/remove_cvref.h> #include <__utility/auto_cast.h> #include <__utility/declval.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h index 144746babb325..a40eab3c5a257 100644 --- a/libcxx/include/__ranges/subrange.h +++ b/libcxx/include/__ranges/subrange.h @@ -17,6 +17,7 @@ #include <__concepts/derived_from.h> #include <__concepts/different_from.h> #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/subrange.h> #include <__iterator/advance.h> #include <__iterator/concepts.h> @@ -40,7 +41,6 @@ #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/take_view.h b/libcxx/include/__ranges/take_view.h index 8e2d354b58a55..39f99cee6b4da 100644 --- a/libcxx/include/__ranges/take_view.h +++ b/libcxx/include/__ranges/take_view.h @@ -42,7 +42,6 @@ #include <__utility/auto_cast.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__ranges/to.h b/libcxx/include/__ranges/to.h index 52666075da3e2..76249bdd9891c 100644 --- a/libcxx/include/__ranges/to.h +++ b/libcxx/include/__ranges/to.h @@ -15,6 +15,7 @@ #include <__concepts/derived_from.h> #include <__concepts/same_as.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/bind_back.h> #include <__iterator/iterator_traits.h> #include <__ranges/access.h> @@ -30,7 +31,6 @@ #include <__type_traits/type_identity.h> #include <__utility/declval.h> #include <__utility/forward.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 
c4817601039f3..6fc3d9255946c 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -14,6 +14,7 @@ #include <__algorithm/move.h> #include <__algorithm/move_backward.h> #include <__config> +#include <__cstddef/size_t.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> #include <__iterator/move_iterator.h> @@ -35,7 +36,6 @@ #include <__type_traits/remove_reference.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__stop_token/intrusive_shared_ptr.h b/libcxx/include/__stop_token/intrusive_shared_ptr.h index f00cea5bc2b67..d20c5227ec729 100644 --- a/libcxx/include/__stop_token/intrusive_shared_ptr.h +++ b/libcxx/include/__stop_token/intrusive_shared_ptr.h @@ -13,10 +13,10 @@ #include <__atomic/atomic.h> #include <__atomic/memory_order.h> #include <__config> +#include <__cstddef/nullptr_t.h> #include <__type_traits/is_reference.h> #include <__utility/move.h> #include <__utility/swap.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h index 107f0a96a0227..f59764bb0d048 100644 --- a/libcxx/include/__string/char_traits.h +++ b/libcxx/include/__string/char_traits.h @@ -17,6 +17,7 @@ #include <__assert> #include <__compare/ordering.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/hash.h> #include <__functional/identity.h> #include <__iterator/iterator_traits.h> @@ -24,7 +25,6 @@ #include <__string/constexpr_c_functions.h> #include <__type_traits/is_constant_evaluated.h> #include <__utility/is_pointer_in_range.h> -#include #include #include #include diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index e62a7b0cd1b3b..f50eac34a1c05 100644 --- 
a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -10,6 +10,7 @@ #define _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H #include <__config> +#include <__cstddef/size_t.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/datasizeof.h> @@ -25,7 +26,6 @@ #include <__type_traits/is_trivially_lexicographically_comparable.h> #include <__type_traits/remove_cv.h> #include <__utility/is_pointer_in_range.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__system_error/error_code.h b/libcxx/include/__system_error/error_code.h index 475f2bb96a56d..6fa673b4ff713 100644 --- a/libcxx/include/__system_error/error_code.h +++ b/libcxx/include/__system_error/error_code.h @@ -17,7 +17,6 @@ #include <__system_error/errc.h> #include <__system_error/error_category.h> #include <__system_error/error_condition.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__system_error/error_condition.h b/libcxx/include/__system_error/error_condition.h index 42898c1f0e901..bfee6528c3f1d 100644 --- a/libcxx/include/__system_error/error_condition.h +++ b/libcxx/include/__system_error/error_condition.h @@ -16,7 +16,6 @@ #include <__functional/unary_function.h> #include <__system_error/errc.h> #include <__system_error/error_category.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__utility/in_place.h b/libcxx/include/__utility/in_place.h index 459b271675261..edaa4e02c55f5 100644 --- a/libcxx/include/__utility/in_place.h +++ b/libcxx/include/__utility/in_place.h @@ -10,9 +10,9 @@ #define _LIBCPP___UTILITY_IN_PLACE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/remove_cvref.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git 
a/libcxx/include/__utility/integer_sequence.h b/libcxx/include/__utility/integer_sequence.h index ccce9433e7a80..35eb606ee37f6 100644 --- a/libcxx/include/__utility/integer_sequence.h +++ b/libcxx/include/__utility/integer_sequence.h @@ -10,8 +10,8 @@ #define _LIBCPP___UTILITY_INTEGER_SEQUENCE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_integral.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 78534a3f399f2..cca6490476db1 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -13,6 +13,7 @@ #include <__compare/synth_three_way.h> #include <__concepts/different_from.h> #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/array.h> #include <__fwd/pair.h> #include <__fwd/tuple.h> @@ -43,7 +44,6 @@ #include <__utility/forward.h> #include <__utility/move.h> #include <__utility/piecewise_construct.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__utility/priority_tag.h b/libcxx/include/__utility/priority_tag.h index a159ce7f1afb3..ef7cf162b9b4c 100644 --- a/libcxx/include/__utility/priority_tag.h +++ b/libcxx/include/__utility/priority_tag.h @@ -10,7 +10,7 @@ #define _LIBCPP___UTILITY_PRIORITY_TAG_H #include <__config> -#include +#include <__cstddef/size_t.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h index 9e13797573d2d..70e068f89f62e 100644 --- a/libcxx/include/__utility/small_buffer.h +++ b/libcxx/include/__utility/small_buffer.h @@ -10,13 +10,13 @@ #define _LIBCPP___UTILITY_SMALL_BUFFER_H #include <__config> +#include <__cstddef/byte.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_trivially_constructible.h> #include 
<__type_traits/is_trivially_destructible.h> #include <__utility/exception_guard.h> #include <__utility/forward.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h index ecfbdec75a2ae..666d6d50f0d91 100644 --- a/libcxx/include/__utility/swap.h +++ b/libcxx/include/__utility/swap.h @@ -10,6 +10,7 @@ #define _LIBCPP___UTILITY_SWAP_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_constructible.h> @@ -18,7 +19,6 @@ #include <__type_traits/is_swappable.h> #include <__utility/declval.h> #include <__utility/move.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__variant/monostate.h b/libcxx/include/__variant/monostate.h index 16f156609eb7d..c5d2dacaf4205 100644 --- a/libcxx/include/__variant/monostate.h +++ b/libcxx/include/__variant/monostate.h @@ -12,8 +12,8 @@ #include <__compare/ordering.h> #include <__config> +#include <__cstddef/size_t.h> #include <__functional/hash.h> -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/array b/libcxx/include/array index 0e9af4198632d..b1a9f0d29e68a 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -118,6 +118,7 @@ template const T&& get(const array&&) noexce #include <__algorithm/swap_ranges.h> #include <__assert> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/array.h> #include <__iterator/reverse_iterator.h> #include <__iterator/wrap_iter.h> diff --git a/libcxx/include/atomic b/libcxx/include/atomic index ebd46238eec98..716d198bc236b 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -620,6 +620,7 @@ template #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include +# include # include # include # include diff 
--git a/libcxx/include/barrier b/libcxx/include/barrier index abc014e8aaf5c..36c30c7fe2e75 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -52,11 +52,11 @@ namespace std # include <__assert> # include <__atomic/atomic_base.h> # include <__atomic/memory_order.h> +# include <__cstddef/ptrdiff_t.h> # include <__memory/unique_ptr.h> # include <__thread/poll_with_backoff.h> # include <__thread/timed_backoff_policy.h> # include <__utility/move.h> -# include # include # include # include diff --git a/libcxx/include/bitset b/libcxx/include/bitset index 645c172f3be49..9f14b69e7a9b4 100644 --- a/libcxx/include/bitset +++ b/libcxx/include/bitset @@ -136,7 +136,6 @@ template struct hash>; #include <__functional/unary_function.h> #include <__type_traits/is_char_like_type.h> #include -#include #include #include #include diff --git a/libcxx/include/charconv b/libcxx/include/charconv index 29c6875008abb..8f5e697eec439 100644 --- a/libcxx/include/charconv +++ b/libcxx/include/charconv @@ -111,6 +111,7 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include +# include # include # include # include diff --git a/libcxx/include/compare b/libcxx/include/compare index 8a41835b14897..de0e4c7ec2280 100644 --- a/libcxx/include/compare +++ b/libcxx/include/compare @@ -172,6 +172,7 @@ namespace std { #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include +# include # include #endif diff --git a/libcxx/include/concepts b/libcxx/include/concepts index e89d216a59372..6db2caebaf5b1 100644 --- a/libcxx/include/concepts +++ b/libcxx/include/concepts @@ -158,11 +158,8 @@ namespace std { #include -#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 17 +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include -#endif - -#if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) # include #endif diff --git 
a/libcxx/include/coroutine b/libcxx/include/coroutine index ee54388ad5aaf..18601717768c0 100644 --- a/libcxx/include/coroutine +++ b/libcxx/include/coroutine @@ -59,6 +59,7 @@ struct suspend_always; #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git a/libcxx/include/exception b/libcxx/include/exception index 64463e02cb16a..88eaaf06bf4a2 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -89,6 +89,7 @@ template void rethrow_if_nested(const E& e); #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include #endif diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h index e364e146a6011..d208bf5c4fbb9 100644 --- a/libcxx/include/experimental/__simd/aligned_tag.h +++ b/libcxx/include/experimental/__simd/aligned_tag.h @@ -11,9 +11,9 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H #include <__config> +#include <__cstddef/size_t.h> #include <__memory/assume_aligned.h> #include <__type_traits/remove_const.h> -#include #include #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h index 2ac7224159cf3..1b4fcf958516c 100644 --- a/libcxx/include/experimental/__simd/declaration.h +++ b/libcxx/include/experimental/__simd/declaration.h @@ -11,7 +11,7 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H #include <__config> -#include +#include <__cstddef/size_t.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index cba460baaa95b..b9fe962348adc 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -11,13 +11,13 @@ #define 
_LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> #include <__utility/declval.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #include _LIBCPP_PUSH_MACROS diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index d7ac1225fd789..da318d2f4650f 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -12,8 +12,8 @@ #include <__assert> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> -#include #include #include diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index 8d8d96518d97b..fd919e75e32f5 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -11,12 +11,12 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> -#include #include #include #include diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index 03e9da8519bfb..6b6f671bf3e64 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -11,9 +11,9 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> -#include #include #include #include diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h index b817df604ef72..0bcc2eeba5ce8 100644 --- a/libcxx/include/experimental/__simd/traits.h +++ 
b/libcxx/include/experimental/__simd/traits.h @@ -12,9 +12,9 @@ #include <__bit/bit_ceil.h> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> -#include #include #include diff --git a/libcxx/include/experimental/__simd/utility.h b/libcxx/include/experimental/__simd/utility.h index 01736925d155c..fd9fcecc79865 100644 --- a/libcxx/include/experimental/__simd/utility.h +++ b/libcxx/include/experimental/__simd/utility.h @@ -11,6 +11,7 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_const.h> #include <__type_traits/is_constant_evaluated.h> @@ -21,7 +22,6 @@ #include <__type_traits/void_t.h> #include <__utility/declval.h> #include <__utility/integer_sequence.h> -#include #include #include diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 6e8400948d46a..abc7e9595be9c 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -13,10 +13,10 @@ #include <__assert> #include <__bit/bit_ceil.h> #include <__config> +#include <__cstddef/size_t.h> #include <__type_traits/integral_constant.h> #include <__utility/forward.h> #include <__utility/integer_sequence.h> -#include #include #include #include diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator index e3a9c771fe61c..2488bcfc155a8 100644 --- a/libcxx/include/experimental/iterator +++ b/libcxx/include/experimental/iterator @@ -122,6 +122,7 @@ _LIBCPP_END_NAMESPACE_LFTS _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include #endif diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory index bf8a154690af0..48e42a0a88a64 100644 --- a/libcxx/include/experimental/memory +++ 
b/libcxx/include/experimental/memory @@ -50,6 +50,8 @@ public: */ #include <__config> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__functional/hash.h> #include <__functional/operations.h> #include <__type_traits/add_lvalue_reference.h> @@ -57,7 +59,7 @@ public: #include <__type_traits/common_type.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_convertible.h> -#include +#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -192,6 +194,7 @@ _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP_ENABLE_EXPERIMENTAL #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include #endif diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const index 510d374bb4bf9..8466d4e9c7ef3 100644 --- a/libcxx/include/experimental/propagate_const +++ b/libcxx/include/experimental/propagate_const @@ -108,6 +108,8 @@ */ #include <__config> +#include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__functional/operations.h> #include <__fwd/functional.h> #include <__type_traits/conditional.h> @@ -128,7 +130,7 @@ #include <__utility/forward.h> #include <__utility/move.h> #include <__utility/swap.h> -#include +#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -484,6 +486,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include #endif diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd index 35120b4b4aab4..1a868513d160b 100644 --- a/libcxx/include/experimental/simd +++ b/libcxx/include/experimental/simd @@ -85,4 +85,8 @@ inline namespace parallelism_v2 { #include #include +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +#endif + #endif /* _LIBCPP_EXPERIMENTAL_SIMD */ diff --git a/libcxx/include/experimental/type_traits 
b/libcxx/include/experimental/type_traits index a4bb59afaf4ac..6980fc3c51e46 100644 --- a/libcxx/include/experimental/type_traits +++ b/libcxx/include/experimental/type_traits @@ -148,6 +148,10 @@ constexpr bool is_detected_convertible_v = is_detected_convertible<_To, _Op, _Ar _LIBCPP_END_NAMESPACE_LFTS +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +# endif + #endif /* _LIBCPP_STD_VER >= 14 */ #endif /* _LIBCPP_EXPERIMENTAL_TYPE_TRAITS */ diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility index cbc7ad140e40c..00151b967e496 100644 --- a/libcxx/include/experimental/utility +++ b/libcxx/include/experimental/utility @@ -43,4 +43,8 @@ struct _LIBCPP_TEMPLATE_VIS erased_type {}; _LIBCPP_END_NAMESPACE_LFTS +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +#endif + #endif /* _LIBCPP_EXPERIMENTAL_UTILITY */ diff --git a/libcxx/include/initializer_list b/libcxx/include/initializer_list index 680ca1cd20d55..8b9325069c129 100644 --- a/libcxx/include/initializer_list +++ b/libcxx/include/initializer_list @@ -43,7 +43,8 @@ template const E* end(initializer_list il) noexcept; // constexpr in */ #include <__config> -#include +#include <__cstddef/size_t.h> +#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -95,4 +96,8 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Ep* end(initia } // namespace std +#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include +#endif + #endif // _LIBCPP_INITIALIZER_LIST diff --git a/libcxx/include/iterator b/libcxx/include/iterator index fca75f0a19ed1..63fbae220b604 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -743,6 +743,7 @@ template constexpr const E* data(initializer_list il) noexcept; #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git 
a/libcxx/include/latch b/libcxx/include/latch index b56e49bc768bf..92dadf68bcaab 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -48,7 +48,7 @@ namespace std # include <__atomic/atomic_base.h> # include <__atomic/atomic_sync.h> # include <__atomic/memory_order.h> -# include +# include <__cstddef/ptrdiff_t.h> # include # include @@ -124,6 +124,7 @@ _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include +# include #endif #endif // _LIBCPP_LATCH diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index af8c3c15eb276..70f91249a58ec 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -13,7 +13,10 @@ module std_config [system] { module std_core [system] { module cstddef { module byte { header "__cstddef/byte.h" } - module max_align_t { header "__cstddef/max_align_t.h" } + module max_align_t { + header "__cstddef/max_align_t.h" + export * + } module nullptr_t { header "__cstddef/nullptr_t.h" } module ptrdiff_t { header "__cstddef/ptrdiff_t.h" } module size_t { header "__cstddef/size_t.h" } @@ -1408,7 +1411,10 @@ module std [system] { module indirectly_comparable { header "__iterator/indirectly_comparable.h" } module insert_iterator { header "__iterator/insert_iterator.h" } module istream_iterator { header "__iterator/istream_iterator.h" } - module istreambuf_iterator { header "__iterator/istreambuf_iterator.h" } + module istreambuf_iterator { + header "__iterator/istreambuf_iterator.h" + export std.string.char_traits + } module iter_move { header "__iterator/iter_move.h" } module iter_swap { header "__iterator/iter_swap.h" } module iterator_traits { diff --git a/libcxx/include/mutex b/libcxx/include/mutex index 02c52dd72f02b..427fce5f3ec46 100644 --- a/libcxx/include/mutex +++ b/libcxx/include/mutex @@ -199,7 +199,6 @@ template #include <__thread/id.h> #include <__thread/support.h> #include <__utility/forward.h> -#include #include #ifndef 
_LIBCPP_CXX03_LANG # include diff --git a/libcxx/include/new b/libcxx/include/new index 75e2b8742df6b..290ad9e97f8de 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -87,12 +87,12 @@ void operator delete[](void* ptr, void*) noexcept; */ #include <__config> +#include <__cstddef/size_t.h> #include <__exception/exception.h> #include <__type_traits/is_function.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cv.h> #include <__verbose_abort> -#include #include #if defined(_LIBCPP_ABI_VCRUNTIME) @@ -367,6 +367,7 @@ inline constexpr size_t hardware_constructive_interference_size = __GCC_CONSTRUC _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include #endif diff --git a/libcxx/include/numbers b/libcxx/include/numbers index f48ba4baf38ff..191563c3d8a5f 100644 --- a/libcxx/include/numbers +++ b/libcxx/include/numbers @@ -158,6 +158,7 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include +# include # include #endif diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index bf6317c587e2f..98122c96459a2 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -54,10 +54,10 @@ using binary_semaphore = counting_semaphore<1>; // since C++20 # include <__atomic/atomic_sync.h> # include <__atomic/memory_order.h> # include <__chrono/time_point.h> +# include <__cstddef/ptrdiff_t.h> # include <__thread/poll_with_backoff.h> # include <__thread/support.h> # include <__thread/timed_backoff_policy.h> -# include # include # include @@ -181,6 +181,7 @@ _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include +# include #endif #endif // _LIBCPP_SEMAPHORE diff --git a/libcxx/include/span b/libcxx/include/span index a32f7a372e2ae..896a3cd890186 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -148,6 +148,8 @@ template #include 
<__concepts/convertible_to.h> #include <__concepts/equality_comparable.h> #include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/ptrdiff_t.h> #include <__fwd/array.h> #include <__fwd/span.h> #include <__iterator/bounded_iter.h> @@ -173,7 +175,6 @@ template #include <__type_traits/remove_reference.h> #include <__type_traits/type_identity.h> #include <__utility/forward.h> -#include // for byte #include #include #include diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept index daa7b501a8699..8415d3339f7e3 100644 --- a/libcxx/include/stdexcept +++ b/libcxx/include/stdexcept @@ -278,6 +278,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git a/libcxx/include/stop_token b/libcxx/include/stop_token index d4e651d9541f4..cf8d0cf9b9191 100644 --- a/libcxx/include/stop_token +++ b/libcxx/include/stop_token @@ -50,6 +50,7 @@ namespace std { #endif // !defined(_LIBCPP_HAS_NO_THREADS) #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include #endif diff --git a/libcxx/include/string_view b/libcxx/include/string_view index 5beac404fb5b5..0edda7aeb1a77 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -208,6 +208,8 @@ namespace std { #include <__algorithm/min.h> #include <__assert> #include <__config> +#include <__cstddef/ptrdiff_t.h> +#include <__cstddef/size_t.h> #include <__functional/hash.h> #include <__functional/unary_function.h> #include <__fwd/ostream.h> @@ -233,7 +235,6 @@ namespace std { #include <__type_traits/remove_cvref.h> #include <__type_traits/remove_reference.h> #include <__type_traits/type_identity.h> -#include #include #include #include diff --git a/libcxx/include/tuple b/libcxx/include/tuple index e7e14b8d12d42..c3f7b8041686d 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -214,6 +214,7 @@ template #include 
<__compare/ordering.h> #include <__compare/synth_three_way.h> #include <__config> +#include <__cstddef/size_t.h> #include <__fwd/array.h> #include <__fwd/pair.h> #include <__fwd/tuple.h> @@ -262,7 +263,6 @@ template #include <__utility/move.h> #include <__utility/piecewise_construct.h> #include <__utility/swap.h> -#include #include // standard-mandated includes @@ -1411,6 +1411,7 @@ _LIBCPP_POP_MACROS // clang-format on #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git a/libcxx/include/typeindex b/libcxx/include/typeindex index 6398aa40d616a..9f8e65befcba4 100644 --- a/libcxx/include/typeindex +++ b/libcxx/include/typeindex @@ -98,6 +98,7 @@ struct _LIBCPP_TEMPLATE_VIS hash : public __unary_function # include # include # include diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 252afe59a0aa6..28713077c688c 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -57,12 +57,13 @@ public: */ #include <__config> +#include <__cstddef/size_t.h> #include <__exception/exception.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_constant_evaluated.h> #include <__verbose_abort> -#include #include +#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -383,6 +384,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include #endif diff --git a/libcxx/include/utility b/libcxx/include/utility index f97907fbf72e9..138cc3ba31893 100644 --- a/libcxx/include/utility +++ b/libcxx/include/utility @@ -301,6 +301,7 @@ template #endif #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git a/libcxx/include/valarray b/libcxx/include/valarray index b3b48958f92bd..2f7a1a7c5b49e 100644 --- a/libcxx/include/valarray +++ b/libcxx/include/valarray @@ -352,6 
+352,7 @@ template unspecified2 end(const valarray& v); #include <__algorithm/unwrap_iter.h> #include <__assert> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__functional/operations.h> #include <__memory/addressof.h> #include <__memory/allocator.h> @@ -361,7 +362,6 @@ template unspecified2 end(const valarray& v); #include <__utility/move.h> #include <__utility/swap.h> #include -#include #include #include diff --git a/libcxx/include/variant b/libcxx/include/variant index 2e158a4eea314..ee80fb0b5ab5b 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1631,6 +1631,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 +# include # include # include # include diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index 299f810948fd1..3d0d4ead1be74 100644 --- a/libcxx/src/memory_resource.cpp +++ b/libcxx/src/memory_resource.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include diff --git a/libcxx/test/benchmarks/CMakeLists.txt b/libcxx/test/benchmarks/CMakeLists.txt index b402af8d80dae..5e829a639d39a 100644 --- a/libcxx/test/benchmarks/CMakeLists.txt +++ b/libcxx/test/benchmarks/CMakeLists.txt @@ -157,6 +157,7 @@ set(BENCHMARK_TESTS formatter_float.bench.cpp formatter_int.bench.cpp function.bench.cpp + hash.bench.cpp join_view.bench.cpp lexicographical_compare_three_way.bench.cpp map.bench.cpp diff --git a/libcxx/test/benchmarks/hash.bench.cpp b/libcxx/test/benchmarks/hash.bench.cpp new file mode 100644 index 0000000000000..e015987447629 --- /dev/null +++ b/libcxx/test/benchmarks/hash.bench.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03 + +#include +#include +#include + +#include "benchmark/benchmark.h" + +#include "GenerateInput.h" +#include "test_macros.h" + +constexpr std::size_t TestNumInputs = 1024; + +template +inline TEST_ALWAYS_INLINE _Size loadword(const void* __p) { + _Size __r; + std::memcpy(&__r, __p, sizeof(__r)); + return __r; +} + +inline TEST_ALWAYS_INLINE std::size_t hash_len_16(std::size_t __u, std::size_t __v) { + const std::size_t __mul = 0x9ddfea08eb382d69ULL; + std::size_t __a = (__u ^ __v) * __mul; + __a ^= (__a >> 47); + std::size_t __b = (__v ^ __a) * __mul; + __b ^= (__b >> 47); + __b *= __mul; + return __b; +} + +template +inline TEST_ALWAYS_INLINE std::size_t hash_len_0_to_8(const char* __s) { + static_assert(_Len == 4 || _Len == 8, ""); + const uint64_t __a = loadword(__s); + const uint64_t __b = loadword(__s + _Len - 4); + return hash_len_16(_Len + (__a << 3), __b); +} + +struct UInt32Hash { + UInt32Hash() = default; + inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const { + return hash_len_0_to_8<4>(reinterpret_cast(&data)); + } +}; + +template +void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) { + auto in = gen(st.range(0)); + const auto end = in.data() + in.size(); + std::size_t last_hash = 0; + benchmark::DoNotOptimize(&last_hash); + while (st.KeepRunning()) { + for (auto it = in.data(); it != end; ++it) { + benchmark::DoNotOptimize(last_hash += fn(*it)); + } + benchmark::ClobberMemory(); + } +} + +BENCHMARK_CAPTURE(BM_Hash, uint32_random_std_hash, std::hash{}, getRandomIntegerInputs) + ->Arg(TestNumInputs); + +BENCHMARK_CAPTURE(BM_Hash, uint32_random_custom_hash, UInt32Hash{}, getRandomIntegerInputs) + ->Arg(TestNumInputs); + +BENCHMARK_CAPTURE(BM_Hash, uint32_top_std_hash, std::hash{}, getSortedTopBitsIntegerInputs) + ->Arg(TestNumInputs); + 
+BENCHMARK_CAPTURE(BM_Hash, uint32_top_custom_hash, UInt32Hash{}, getSortedTopBitsIntegerInputs) + ->Arg(TestNumInputs); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp index bcf6adda2e021..2e42d6d345b99 100644 --- a/libcxx/test/benchmarks/unordered_set_operations.bench.cpp +++ b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp @@ -95,13 +95,6 @@ struct UInt32Hash2 { } }; -struct UInt64Hash2 { - UInt64Hash2() = default; - inline TEST_ALWAYS_INLINE std::size_t operator()(uint64_t data) const { - return hash_len_0_to_8<8>(reinterpret_cast(&data)); - } -}; - // The sole purpose of this comparator is to be used in BM_Rehash, where // we need something slow enough to be easily noticable in benchmark results. // The default implementation of operator== for strings seems to be a little @@ -123,36 +116,6 @@ struct SlowStringEq { } }; -//----------------------------------------------------------------------------// -// BM_Hash -// ---------------------------------------------------------------------------// - -template -void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) { - auto in = gen(st.range(0)); - const auto end = in.data() + in.size(); - std::size_t last_hash = 0; - benchmark::DoNotOptimize(&last_hash); - while (st.KeepRunning()) { - for (auto it = in.data(); it != end; ++it) { - benchmark::DoNotOptimize(last_hash += fn(*it)); - } - benchmark::ClobberMemory(); - } -} - -BENCHMARK_CAPTURE(BM_Hash, uint32_random_std_hash, std::hash{}, getRandomIntegerInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_Hash, uint32_random_custom_hash, UInt32Hash{}, getRandomIntegerInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_Hash, uint32_top_std_hash, std::hash{}, getSortedTopBitsIntegerInputs) - ->Arg(TestNumInputs); - -BENCHMARK_CAPTURE(BM_Hash, uint32_top_custom_hash, UInt32Hash{}, getSortedTopBitsIntegerInputs) - ->Arg(TestNumInputs); - 
//----------------------------------------------------------------------------// // BM_InsertValue // ---------------------------------------------------------------------------// diff --git a/libcxx/test/libcxx/algorithms/alg.modifying.operations/alg.random.shuffle/random_shuffle.cxx1z.pass.cpp b/libcxx/test/libcxx/algorithms/alg.modifying.operations/alg.random.shuffle/random_shuffle.cxx1z.pass.cpp index c1acc100a6609..4e51014f20b18 100644 --- a/libcxx/test/libcxx/algorithms/alg.modifying.operations/alg.random.shuffle/random_shuffle.cxx1z.pass.cpp +++ b/libcxx/test/libcxx/algorithms/alg.modifying.operations/alg.random.shuffle/random_shuffle.cxx1z.pass.cpp @@ -26,6 +26,7 @@ // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp b/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp index f295b807864ea..2a85e7b5ddcc3 100644 --- a/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp +++ b/libcxx/test/libcxx/algorithms/alg.modifying.operations/copy_move_unwrap_reverse.pass.cpp @@ -15,9 +15,9 @@ #include #include +#include #include #include -#include #include #include "test_iterators.h" diff --git a/libcxx/test/libcxx/algorithms/robust_against_using_non_transparent_comparators.pass.cpp b/libcxx/test/libcxx/algorithms/robust_against_using_non_transparent_comparators.pass.cpp index eaa5d44385be6..39870ebe7ff02 100644 --- a/libcxx/test/libcxx/algorithms/robust_against_using_non_transparent_comparators.pass.cpp +++ b/libcxx/test/libcxx/algorithms/robust_against_using_non_transparent_comparators.pass.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/libcxx/containers/sequences/deque/asan_turning_off.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/asan_turning_off.pass.cpp 
index e9b9cde64ee91..b31775a873481 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/asan_turning_off.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/asan_turning_off.pass.cpp @@ -17,6 +17,7 @@ // This test confirms that those allocators work after turning off annotations. #include +#include #include #include diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 2693617bcb0e5..f01dbac26a8e8 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -73,7 +73,7 @@ {lit_header_restrictions.get(header, '')} // TODO: Fix this test to make it work with localization or wide characters disabled -// UNSUPPORTED: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-experimental-tzdb, no-tzdb +// UNSUPPORTED: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-experimental-tzdb // When built with modules, this test doesn't work because --trace-includes doesn't // report the stack of includes correctly. 
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 48c501863cb76..ae4254183fc84 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -7,6 +7,7 @@ algorithm compare algorithm concepts algorithm cstddef algorithm cstdint +algorithm cstdio algorithm cstdlib algorithm cstring algorithm ctime @@ -86,6 +87,7 @@ array compare array concepts array cstddef array cstdint +array cstdio array cstdlib array cstring array ctime @@ -1181,13 +1183,17 @@ istream utility istream variant istream vector istream version +iterator cctype iterator cmath iterator compare iterator concepts iterator cstddef iterator cstdint +iterator cstdio iterator cstdlib iterator cstring +iterator cwchar +iterator cwctype iterator exception iterator initializer_list iterator iosfwd @@ -1397,15 +1403,19 @@ mdspan variant mdspan vector mdspan version memory atomic +memory cctype memory climits memory cmath memory compare memory concepts memory cstddef memory cstdint +memory cstdio memory cstdlib memory cstring memory ctime +memory cwchar +memory cwctype memory exception memory initializer_list memory iosfwd @@ -1551,15 +1561,19 @@ numeric variant numeric vector numeric version optional atomic +optional cctype optional climits optional cmath optional compare optional concepts optional cstddef optional cstdint +optional cstdio optional cstdlib optional cstring optional ctime +optional cwchar +optional cwctype optional exception optional initializer_list optional iosfwd @@ -1871,15 +1885,19 @@ regex variant regex vector regex version scoped_allocator atomic +scoped_allocator cctype scoped_allocator climits scoped_allocator cmath scoped_allocator compare scoped_allocator concepts scoped_allocator cstddef scoped_allocator cstdint +scoped_allocator cstdio scoped_allocator cstdlib scoped_allocator cstring scoped_allocator ctime +scoped_allocator cwchar +scoped_allocator cwctype 
scoped_allocator exception scoped_allocator initializer_list scoped_allocator iosfwd @@ -2499,6 +2517,7 @@ unordered_map compare unordered_map concepts unordered_map cstddef unordered_map cstdint +unordered_map cstdio unordered_map cstdlib unordered_map cstring unordered_map ctime diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 48c501863cb76..ae4254183fc84 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -7,6 +7,7 @@ algorithm compare algorithm concepts algorithm cstddef algorithm cstdint +algorithm cstdio algorithm cstdlib algorithm cstring algorithm ctime @@ -86,6 +87,7 @@ array compare array concepts array cstddef array cstdint +array cstdio array cstdlib array cstring array ctime @@ -1181,13 +1183,17 @@ istream utility istream variant istream vector istream version +iterator cctype iterator cmath iterator compare iterator concepts iterator cstddef iterator cstdint +iterator cstdio iterator cstdlib iterator cstring +iterator cwchar +iterator cwctype iterator exception iterator initializer_list iterator iosfwd @@ -1397,15 +1403,19 @@ mdspan variant mdspan vector mdspan version memory atomic +memory cctype memory climits memory cmath memory compare memory concepts memory cstddef memory cstdint +memory cstdio memory cstdlib memory cstring memory ctime +memory cwchar +memory cwctype memory exception memory initializer_list memory iosfwd @@ -1551,15 +1561,19 @@ numeric variant numeric vector numeric version optional atomic +optional cctype optional climits optional cmath optional compare optional concepts optional cstddef optional cstdint +optional cstdio optional cstdlib optional cstring optional ctime +optional cwchar +optional cwctype optional exception optional initializer_list optional iosfwd @@ -1871,15 +1885,19 @@ regex variant regex vector regex version scoped_allocator atomic +scoped_allocator cctype scoped_allocator 
climits scoped_allocator cmath scoped_allocator compare scoped_allocator concepts scoped_allocator cstddef scoped_allocator cstdint +scoped_allocator cstdio scoped_allocator cstdlib scoped_allocator cstring scoped_allocator ctime +scoped_allocator cwchar +scoped_allocator cwctype scoped_allocator exception scoped_allocator initializer_list scoped_allocator iosfwd @@ -2499,6 +2517,7 @@ unordered_map compare unordered_map concepts unordered_map cstddef unordered_map cstdint +unordered_map cstdio unordered_map cstdlib unordered_map cstring unordered_map ctime diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index 6191c9012c631..f14b317000370 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -7,6 +7,7 @@ algorithm compare algorithm concepts algorithm cstddef algorithm cstdint +algorithm cstdio algorithm cstdlib algorithm cstring algorithm ctime @@ -88,6 +89,7 @@ array compare array concepts array cstddef array cstdint +array cstdio array cstdlib array cstring array ctime @@ -1209,13 +1211,17 @@ istream utility istream variant istream vector istream version +iterator cctype iterator cmath iterator compare iterator concepts iterator cstddef iterator cstdint +iterator cstdio iterator cstdlib iterator cstring +iterator cwchar +iterator cwctype iterator exception iterator initializer_list iterator iosfwd @@ -1429,15 +1435,19 @@ mdspan variant mdspan vector mdspan version memory atomic +memory cctype memory climits memory cmath memory compare memory concepts memory cstddef memory cstdint +memory cstdio memory cstdlib memory cstring memory ctime +memory cwchar +memory cwctype memory exception memory initializer_list memory iosfwd @@ -1585,15 +1595,19 @@ numeric variant numeric vector numeric version optional atomic +optional cctype optional climits optional cmath optional compare optional concepts optional cstddef optional cstdint +optional 
cstdio optional cstdlib optional cstring optional ctime +optional cwchar +optional cwctype optional exception optional initializer_list optional iosfwd @@ -1910,15 +1924,19 @@ regex variant regex vector regex version scoped_allocator atomic +scoped_allocator cctype scoped_allocator climits scoped_allocator cmath scoped_allocator compare scoped_allocator concepts scoped_allocator cstddef scoped_allocator cstdint +scoped_allocator cstdio scoped_allocator cstdlib scoped_allocator cstring scoped_allocator ctime +scoped_allocator cwchar +scoped_allocator cwctype scoped_allocator exception scoped_allocator initializer_list scoped_allocator iosfwd @@ -2550,6 +2568,7 @@ unordered_map compare unordered_map concepts unordered_map cstddef unordered_map cstdint +unordered_map cstdio unordered_map cstdlib unordered_map cstring unordered_map ctime diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index 5d46162e3f899..d4bc0a38c1643 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -7,6 +7,7 @@ algorithm compare algorithm concepts algorithm cstddef algorithm cstdint +algorithm cstdio algorithm cstdlib algorithm cstring algorithm ctime @@ -86,6 +87,7 @@ array compare array concepts array cstddef array cstdint +array cstdio array cstdlib array cstring array ctime @@ -1205,13 +1207,17 @@ istream utility istream variant istream vector istream version +iterator cctype iterator cmath iterator compare iterator concepts iterator cstddef iterator cstdint +iterator cstdio iterator cstdlib iterator cstring +iterator cwchar +iterator cwctype iterator exception iterator initializer_list iterator iosfwd @@ -1421,15 +1427,19 @@ mdspan variant mdspan vector mdspan version memory atomic +memory cctype memory climits memory cmath memory compare memory concepts memory cstddef memory cstdint +memory cstdio memory cstdlib memory cstring memory ctime +memory cwchar 
+memory cwctype memory exception memory initializer_list memory iosfwd @@ -1575,15 +1585,19 @@ numeric variant numeric vector numeric version optional atomic +optional cctype optional climits optional cmath optional compare optional concepts optional cstddef optional cstdint +optional cstdio optional cstdlib optional cstring optional ctime +optional cwchar +optional cwctype optional exception optional initializer_list optional iosfwd @@ -1895,15 +1909,19 @@ regex variant regex vector regex version scoped_allocator atomic +scoped_allocator cctype scoped_allocator climits scoped_allocator cmath scoped_allocator compare scoped_allocator concepts scoped_allocator cstddef scoped_allocator cstdint +scoped_allocator cstdio scoped_allocator cstdlib scoped_allocator cstring scoped_allocator ctime +scoped_allocator cwchar +scoped_allocator cwctype scoped_allocator exception scoped_allocator initializer_list scoped_allocator iosfwd @@ -2523,6 +2541,7 @@ unordered_map compare unordered_map concepts unordered_map cstddef unordered_map cstdint +unordered_map cstdio unordered_map cstdlib unordered_map cstring unordered_map ctime diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 20fe9878ce3ea..304166547abf3 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -7,6 +7,7 @@ algorithm compare algorithm concepts algorithm cstddef algorithm cstdint +algorithm cstdio algorithm cstdlib algorithm cstring algorithm ctime @@ -29,15 +30,19 @@ algorithm utility algorithm variant algorithm version any atomic +any cctype any climits any cmath any compare any concepts any cstddef any cstdint +any cstdio any cstdlib any cstring any ctime +any cwchar +any cwctype any exception any initializer_list any iosfwd @@ -63,6 +68,7 @@ array compare array concepts array cstddef array cstdint +array cstdio array cstdlib array cstring array ctime @@ -97,15 +103,19 @@ atomic 
ratio atomic type_traits atomic version barrier atomic +barrier cctype barrier climits barrier cmath barrier compare barrier concepts barrier cstddef barrier cstdint +barrier cstdio barrier cstdlib barrier cstring barrier ctime +barrier cwchar +barrier cwctype barrier exception barrier initializer_list barrier iosfwd @@ -1191,13 +1201,17 @@ istream utility istream variant istream vector istream version +iterator cctype iterator cmath iterator compare iterator concepts iterator cstddef iterator cstdint +iterator cstdio iterator cstdlib iterator cstring +iterator cwchar +iterator cwctype iterator exception iterator initializer_list iterator iosfwd @@ -1407,15 +1421,19 @@ mdspan variant mdspan vector mdspan version memory atomic +memory cctype memory climits memory cmath memory compare memory concepts memory cstddef memory cstdint +memory cstdio memory cstdlib memory cstring memory ctime +memory cwchar +memory cwctype memory exception memory initializer_list memory iosfwd @@ -1561,15 +1579,19 @@ numeric variant numeric vector numeric version optional atomic +optional cctype optional climits optional cmath optional compare optional concepts optional cstddef optional cstdint +optional cstdio optional cstdlib optional cstring optional ctime +optional cwchar +optional cwctype optional exception optional initializer_list optional iosfwd @@ -1881,15 +1903,19 @@ regex variant regex vector regex version scoped_allocator atomic +scoped_allocator cctype scoped_allocator climits scoped_allocator cmath scoped_allocator compare scoped_allocator concepts scoped_allocator cstddef scoped_allocator cstdint +scoped_allocator cstdio scoped_allocator cstdlib scoped_allocator cstring scoped_allocator ctime +scoped_allocator cwchar +scoped_allocator cwctype scoped_allocator exception scoped_allocator initializer_list scoped_allocator iosfwd @@ -2519,6 +2545,7 @@ unordered_map compare unordered_map concepts unordered_map cstddef unordered_map cstdint +unordered_map cstdio unordered_map 
cstdlib unordered_map cstring unordered_map ctime diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 5ee89ec307cc2..48d4425c0333f 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -1,7 +1,6 @@ algorithm cctype algorithm climits algorithm compare -algorithm cstddef algorithm cstdint algorithm cstring algorithm ctime @@ -15,7 +14,6 @@ algorithm optional algorithm ratio algorithm tuple algorithm version -any cstddef any cstdint any cstring any initializer_list @@ -25,7 +23,6 @@ any typeinfo any version array cctype array compare -array cstddef array cstdint array cwchar array cwctype @@ -35,7 +32,6 @@ array new array stdexcept array version atomic climits -atomic cstddef atomic cstdint atomic cstring atomic ctime @@ -43,7 +39,6 @@ atomic limits atomic ratio atomic version barrier climits -barrier cstddef barrier cstdint barrier cstring barrier ctime @@ -57,7 +52,6 @@ bit version bitset cctype bitset climits bitset compare -bitset cstddef bitset cstdint bitset cstdio bitset cstring @@ -105,7 +99,6 @@ ccomplex tuple ccomplex typeinfo ccomplex version charconv cerrno -charconv cstddef charconv cstdint charconv initializer_list charconv limits @@ -170,7 +163,6 @@ codecvt string_view codecvt tuple codecvt typeinfo codecvt version -compare cstddef compare cstdint compare limits compare version @@ -205,14 +197,12 @@ complex string_view complex tuple complex typeinfo complex version -concepts cstddef concepts version condition_variable atomic condition_variable cctype condition_variable cerrno condition_variable climits condition_variable compare -condition_variable cstddef condition_variable cstdint condition_variable cstdio condition_variable cstring @@ -231,7 +221,6 @@ condition_variable tuple condition_variable typeinfo condition_variable version coroutine compare -coroutine cstddef coroutine cstdint coroutine cstring coroutine limits 
@@ -274,7 +263,6 @@ cwchar cwctype cwctype cctype deque cctype deque compare -deque cstddef deque cstdint deque cstring deque cwchar @@ -285,14 +273,12 @@ deque new deque stdexcept deque tuple deque version -exception cstddef exception cstdint exception cstdlib exception new exception typeinfo exception version execution version -expected cstddef expected cstdint expected initializer_list expected new @@ -328,23 +314,18 @@ experimental/iterator tuple experimental/iterator typeinfo experimental/iterator variant experimental/iterator version -experimental/memory cstddef experimental/memory cstdint experimental/memory cstring experimental/memory version -experimental/propagate_const cstddef experimental/propagate_const version -experimental/simd cstddef experimental/simd cstdint experimental/simd limits experimental/simd version -experimental/type_traits cstddef experimental/type_traits cstdint experimental/type_traits initializer_list experimental/type_traits type_traits experimental/type_traits version experimental/utility compare -experimental/utility cstddef experimental/utility cstdint experimental/utility initializer_list experimental/utility limits @@ -432,7 +413,6 @@ format typeinfo format version forward_list cctype forward_list compare -forward_list cstddef forward_list cstdint forward_list cwchar forward_list cwctype @@ -481,7 +461,6 @@ fstream version functional array functional cctype functional compare -functional cstddef functional cstdint functional cstring functional cwchar @@ -529,7 +508,6 @@ future thread future tuple future typeinfo future version -initializer_list cstddef initializer_list version iomanip bitset iomanip cctype @@ -648,11 +626,14 @@ istream string_view istream tuple istream typeinfo istream version +iterator cctype iterator compare iterator concepts -iterator cstddef iterator cstdint +iterator cstdio iterator cstring +iterator cwchar +iterator cwctype iterator initializer_list iterator iosfwd iterator limits @@ -660,7 +641,6 @@ 
iterator new iterator variant iterator version latch climits -latch cstddef latch cstdint latch cstring latch ctime @@ -670,7 +650,6 @@ latch version limits version list cctype list compare -list cstddef list cstdint list cstring list cwchar @@ -708,7 +687,6 @@ locale typeinfo locale version map cctype map compare -map cstddef map cstdint map cstring map cwchar @@ -722,10 +700,8 @@ map tuple map version mdspan array mdspan cctype -mdspan cinttypes mdspan compare mdspan concepts -mdspan cstddef mdspan cstdint mdspan cwchar mdspan cwctype @@ -736,7 +712,6 @@ mdspan span mdspan stdexcept mdspan version memory compare -memory cstddef memory cstdint memory cstring memory initializer_list @@ -749,7 +724,6 @@ memory_resource cctype memory_resource cerrno memory_resource climits memory_resource compare -memory_resource cstddef memory_resource cstdint memory_resource cstdio memory_resource cstring @@ -772,7 +746,6 @@ mutex cctype mutex cerrno mutex climits mutex compare -mutex cstddef mutex cstdint mutex cstdio mutex cstring @@ -790,12 +763,10 @@ mutex string_view mutex tuple mutex typeinfo mutex version -new cstddef new version numbers version numeric climits numeric compare -numeric cstddef numeric cstdint numeric cstring numeric ctime @@ -807,7 +778,6 @@ numeric ratio numeric tuple numeric version optional compare -optional cstddef optional cstdint optional cstring optional initializer_list @@ -902,7 +872,6 @@ random cctype random climits random cmath random compare -random cstddef random cstdint random cstdio random cstring @@ -924,8 +893,8 @@ random version ranges cctype ranges compare ranges concepts -ranges cstddef ranges cstdint +ranges cstdio ranges cstring ranges cwchar ranges cwctype @@ -969,14 +938,12 @@ regex typeinfo regex vector regex version scoped_allocator compare -scoped_allocator cstddef scoped_allocator cstdint scoped_allocator limits scoped_allocator new scoped_allocator tuple scoped_allocator version semaphore climits -semaphore cstddef semaphore 
cstdint semaphore cstring semaphore ctime @@ -985,7 +952,6 @@ semaphore ratio semaphore version set cctype set compare -set cstddef set cstdint set cstring set cwchar @@ -1000,7 +966,6 @@ shared_mutex cctype shared_mutex cerrno shared_mutex climits shared_mutex compare -shared_mutex cstddef shared_mutex cstdint shared_mutex cstdio shared_mutex cstring @@ -1019,7 +984,6 @@ shared_mutex tuple shared_mutex version source_location cstdint source_location version -span cstddef span initializer_list span limits span stdexcept @@ -1055,7 +1019,6 @@ sstream typeinfo sstream version stack cctype stack compare -stack cstddef stack cstdint stack cstring stack cwchar @@ -1069,7 +1032,6 @@ stack tuple stack version stop_token atomic stop_token climits -stop_token cstddef stop_token cstdint stop_token cstring stop_token ctime @@ -1104,7 +1066,6 @@ streambuf version string cctype string climits string compare -string cstddef string cstdint string cstdio string cstring @@ -1120,7 +1081,6 @@ string tuple string version string_view cctype string_view compare -string_view cstddef string_view cstdint string_view cstdio string_view cstring @@ -1208,7 +1168,6 @@ system_error cctype system_error cerrno system_error climits system_error compare -system_error cstddef system_error cstdint system_error cstdio system_error cstring @@ -1256,23 +1215,19 @@ thread tuple thread typeinfo thread version tuple compare -tuple cstddef tuple cstdint tuple limits tuple version type_traits cstdint type_traits version typeindex compare -typeindex cstddef typeindex cstdint typeindex limits typeindex typeinfo typeindex version -typeinfo cstddef typeinfo cstdint typeinfo version unordered_map compare -unordered_map cstddef unordered_map cstdint unordered_map cstring unordered_map initializer_list @@ -1283,7 +1238,6 @@ unordered_map stdexcept unordered_map tuple unordered_map version unordered_set compare -unordered_set cstddef unordered_set cstdint unordered_set cstring unordered_set initializer_list @@ 
-1293,20 +1247,17 @@ unordered_set optional unordered_set tuple unordered_set version utility compare -utility cstddef utility cstdint utility initializer_list utility limits utility version valarray cmath -valarray cstddef valarray cstdint valarray initializer_list valarray limits valarray new valarray version variant compare -variant cstddef variant cstdint variant cstring variant initializer_list diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index ee17223e66bee..944002f4974d0 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -1,7 +1,6 @@ algorithm cctype algorithm climits algorithm compare -algorithm cstddef algorithm cstdint algorithm cstring algorithm ctime @@ -15,7 +14,6 @@ algorithm optional algorithm ratio algorithm tuple algorithm version -any cstddef any cstdint any cstring any initializer_list @@ -25,7 +23,6 @@ any typeinfo any version array cctype array compare -array cstddef array cstdint array cwchar array cwctype @@ -35,7 +32,6 @@ array new array stdexcept array version atomic climits -atomic cstddef atomic cstdint atomic cstring atomic ctime @@ -43,7 +39,6 @@ atomic limits atomic ratio atomic version barrier climits -barrier cstddef barrier cstdint barrier cstring barrier ctime @@ -57,7 +52,6 @@ bit version bitset cctype bitset climits bitset compare -bitset cstddef bitset cstdint bitset cstdio bitset cstring @@ -105,7 +99,6 @@ ccomplex tuple ccomplex typeinfo ccomplex version charconv cerrno -charconv cstddef charconv cstdint charconv initializer_list charconv limits @@ -170,7 +163,6 @@ codecvt string_view codecvt tuple codecvt typeinfo codecvt version -compare cstddef compare cstdint compare limits compare version @@ -205,14 +197,12 @@ complex string_view complex tuple complex typeinfo complex version -concepts cstddef concepts version condition_variable atomic condition_variable cctype condition_variable cerrno 
condition_variable climits condition_variable compare -condition_variable cstddef condition_variable cstdint condition_variable cstdio condition_variable cstring @@ -231,7 +221,6 @@ condition_variable tuple condition_variable typeinfo condition_variable version coroutine compare -coroutine cstddef coroutine cstdint coroutine cstring coroutine limits @@ -274,7 +263,6 @@ cwchar cwctype cwctype cctype deque cctype deque compare -deque cstddef deque cstdint deque cstring deque cwchar @@ -285,14 +273,12 @@ deque new deque stdexcept deque tuple deque version -exception cstddef exception cstdint exception cstdlib exception new exception typeinfo exception version execution version -expected cstddef expected cstdint expected initializer_list expected new @@ -328,23 +314,18 @@ experimental/iterator tuple experimental/iterator typeinfo experimental/iterator variant experimental/iterator version -experimental/memory cstddef experimental/memory cstdint experimental/memory cstring experimental/memory version -experimental/propagate_const cstddef experimental/propagate_const version -experimental/simd cstddef experimental/simd cstdint experimental/simd limits experimental/simd version -experimental/type_traits cstddef experimental/type_traits cstdint experimental/type_traits initializer_list experimental/type_traits type_traits experimental/type_traits version experimental/utility compare -experimental/utility cstddef experimental/utility cstdint experimental/utility initializer_list experimental/utility limits @@ -432,7 +413,6 @@ format typeinfo format version forward_list cctype forward_list compare -forward_list cstddef forward_list cstdint forward_list cwchar forward_list cwctype @@ -480,7 +460,6 @@ fstream version functional array functional cctype functional compare -functional cstddef functional cstdint functional cstring functional cwchar @@ -528,7 +507,6 @@ future thread future tuple future typeinfo future version -initializer_list cstddef initializer_list version 
iomanip bitset iomanip cctype @@ -647,11 +625,14 @@ istream string_view istream tuple istream typeinfo istream version +iterator cctype iterator compare iterator concepts -iterator cstddef iterator cstdint +iterator cstdio iterator cstring +iterator cwchar +iterator cwctype iterator initializer_list iterator iosfwd iterator limits @@ -659,7 +640,6 @@ iterator new iterator variant iterator version latch climits -latch cstddef latch cstdint latch cstring latch ctime @@ -669,7 +649,6 @@ latch version limits version list cctype list compare -list cstddef list cstdint list cstring list cwchar @@ -707,7 +686,6 @@ locale typeinfo locale version map cctype map compare -map cstddef map cstdint map cstring map cwchar @@ -721,10 +699,8 @@ map tuple map version mdspan array mdspan cctype -mdspan cinttypes mdspan compare mdspan concepts -mdspan cstddef mdspan cstdint mdspan cwchar mdspan cwctype @@ -735,7 +711,6 @@ mdspan span mdspan stdexcept mdspan version memory compare -memory cstddef memory cstdint memory cstring memory initializer_list @@ -748,7 +723,6 @@ memory_resource cctype memory_resource cerrno memory_resource climits memory_resource compare -memory_resource cstddef memory_resource cstdint memory_resource cstdio memory_resource cstring @@ -771,7 +745,6 @@ mutex cctype mutex cerrno mutex climits mutex compare -mutex cstddef mutex cstdint mutex cstdio mutex cstring @@ -789,12 +762,10 @@ mutex string_view mutex tuple mutex typeinfo mutex version -new cstddef new version numbers version numeric climits numeric compare -numeric cstddef numeric cstdint numeric cstring numeric ctime @@ -806,7 +777,6 @@ numeric ratio numeric tuple numeric version optional compare -optional cstddef optional cstdint optional cstring optional initializer_list @@ -901,7 +871,6 @@ random cctype random climits random cmath random compare -random cstddef random cstdint random cstdio random cstring @@ -923,8 +892,8 @@ random version ranges cctype ranges compare ranges concepts -ranges cstddef 
ranges cstdint +ranges cstdio ranges cstring ranges cwchar ranges cwctype @@ -968,14 +937,12 @@ regex typeinfo regex vector regex version scoped_allocator compare -scoped_allocator cstddef scoped_allocator cstdint scoped_allocator limits scoped_allocator new scoped_allocator tuple scoped_allocator version semaphore climits -semaphore cstddef semaphore cstdint semaphore cstring semaphore ctime @@ -984,7 +951,6 @@ semaphore ratio semaphore version set cctype set compare -set cstddef set cstdint set cstring set cwchar @@ -999,7 +965,6 @@ shared_mutex cctype shared_mutex cerrno shared_mutex climits shared_mutex compare -shared_mutex cstddef shared_mutex cstdint shared_mutex cstdio shared_mutex cstring @@ -1018,7 +983,6 @@ shared_mutex tuple shared_mutex version source_location cstdint source_location version -span cstddef span initializer_list span limits span stdexcept @@ -1054,7 +1018,6 @@ sstream typeinfo sstream version stack cctype stack compare -stack cstddef stack cstdint stack cstring stack cwchar @@ -1068,7 +1031,6 @@ stack tuple stack version stop_token atomic stop_token climits -stop_token cstddef stop_token cstdint stop_token cstring stop_token ctime @@ -1103,7 +1065,6 @@ streambuf version string cctype string climits string compare -string cstddef string cstdint string cstdio string cstring @@ -1119,7 +1080,6 @@ string tuple string version string_view cctype string_view compare -string_view cstddef string_view cstdint string_view cstdio string_view cstring @@ -1207,7 +1167,6 @@ system_error cctype system_error cerrno system_error climits system_error compare -system_error cstddef system_error cstdint system_error cstdio system_error cstring @@ -1255,23 +1214,19 @@ thread tuple thread typeinfo thread version tuple compare -tuple cstddef tuple cstdint tuple limits tuple version type_traits cstdint type_traits version typeindex compare -typeindex cstddef typeindex cstdint typeindex limits typeindex typeinfo typeindex version -typeinfo cstddef typeinfo cstdint 
typeinfo version unordered_map compare -unordered_map cstddef unordered_map cstdint unordered_map cstring unordered_map initializer_list @@ -1282,7 +1237,6 @@ unordered_map stdexcept unordered_map tuple unordered_map version unordered_set compare -unordered_set cstddef unordered_set cstdint unordered_set cstring unordered_set initializer_list @@ -1292,20 +1246,17 @@ unordered_set optional unordered_set tuple unordered_set version utility compare -utility cstddef utility cstdint utility initializer_list utility limits utility version valarray cmath -valarray cstddef valarray cstdint valarray initializer_list valarray limits valarray new valarray version variant compare -variant cstddef variant cstdint variant cstring variant initializer_list diff --git a/libcxx/test/libcxx/utilities/template.bitset/includes.pass.cpp b/libcxx/test/libcxx/utilities/template.bitset/includes.pass.cpp index 42deaaa4b2d37..214b0eff8d9d2 100644 --- a/libcxx/test/libcxx/utilities/template.bitset/includes.pass.cpp +++ b/libcxx/test/libcxx/utilities/template.bitset/includes.pass.cpp @@ -6,16 +6,12 @@ // //===----------------------------------------------------------------------===// -// test that includes , , and +// test that includes , and #include #include "test_macros.h" -#ifndef _LIBCPP_CSTDDEF -#error has not been included -#endif - #ifndef _LIBCPP_STRING #error has not been included #endif diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp index 481d565961b2b..619dc7242a366 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp 
b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp index 7654a4b0c7f00..7250c49a7ff95 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp index b6631add7e48a..6030bed47ec6a 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp index 2a51127a45914..56af9f234d075 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/libcxx/test/std/atomics/atomics.types.generic/address.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/address.pass.cpp index 0926628a2e9a8..dbe1841762f20 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/address.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/address.pass.cpp @@ -65,9 +65,10 @@ // }; #include +#include +#include #include #include -#include #include diff --git a/libcxx/test/std/concepts/concepts.callable/concept.invocable/invocable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.callable/concept.invocable/invocable.compile.pass.cpp index 9b9bac27174e0..f9c8f645b284f 100644 --- a/libcxx/test/std/concepts/concepts.callable/concept.invocable/invocable.compile.pass.cpp +++ 
b/libcxx/test/std/concepts/concepts.callable/concept.invocable/invocable.compile.pass.cpp @@ -11,8 +11,8 @@ // template // concept invocable; -#include #include +#include #include #include #include diff --git a/libcxx/test/std/concepts/concepts.callable/concept.regularinvocable/regular_invocable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.callable/concept.regularinvocable/regular_invocable.compile.pass.cpp index bfd20751861d1..f3547a3ad97cb 100644 --- a/libcxx/test/std/concepts/concepts.callable/concept.regularinvocable/regular_invocable.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.callable/concept.regularinvocable/regular_invocable.compile.pass.cpp @@ -11,11 +11,10 @@ // template // concept regular_invocable; -#include #include +#include #include #include -#include #include template diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp index 126606ef7ab3c..ca0f40eb77d49 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp @@ -14,11 +14,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp index e0edd1f332f81..0afbe582ba896 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp @@ -14,15 +14,12 @@ #include #include +#include #include 
#include #include #include -#include #include -#include -#include -#include #include #include "test_macros.h" diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp index d95de10f35cd9..6f8324eaf7647 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp @@ -14,11 +14,10 @@ #include #include +#include #include #include #include -#include -#include #include #include #include diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp index 0d7bd288c0a18..dffc33265aebf 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp @@ -14,15 +14,12 @@ #include #include +#include #include #include #include #include -#include #include -#include -#include -#include #include #include "compare_types.h" diff --git a/libcxx/test/std/concepts/concepts.lang/concept.constructible/constructible_from.compile.pass.cpp b/libcxx/test/std/concepts/concepts.lang/concept.constructible/constructible_from.compile.pass.cpp index fe0ecece3382f..e3b58d622e450 100644 --- a/libcxx/test/std/concepts/concepts.lang/concept.constructible/constructible_from.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.lang/concept.constructible/constructible_from.compile.pass.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp 
b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp index 6cbf1441b4647..a245131c7869d 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/MinimalElementType.h b/libcxx/test/std/containers/views/mdspan/MinimalElementType.h index fe7f0e1f23837..1d1a2c3752102 100644 --- a/libcxx/test/std/containers/views/mdspan/MinimalElementType.h +++ b/libcxx/test/std/containers/views/mdspan/MinimalElementType.h @@ -9,6 +9,7 @@ #ifndef TEST_STD_CONTAINERS_VIEWS_MDSPAN_MINIMAL_ELEMENT_TYPE_H #define TEST_STD_CONTAINERS_VIEWS_MDSPAN_MINIMAL_ELEMENT_TYPE_H +#include #include #include diff --git a/libcxx/test/std/containers/views/mdspan/extents/CtorTestCombinations.h b/libcxx/test/std/containers/views/mdspan/extents/CtorTestCombinations.h index 18d4f4b61fb23..36f95704631f5 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/CtorTestCombinations.h +++ b/libcxx/test/std/containers/views/mdspan/extents/CtorTestCombinations.h @@ -9,9 +9,10 @@ // -#include -#include #include +#include +#include +#include #include #include "../ConvertibleToIntegral.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp index 574290ebec854..1d713044e60f7 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/comparison.pass.cpp @@ -17,11 +17,10 @@ // if lhs.extent(r) equals rhs.extent(r) for every rank index r of rhs, otherwise false. 
// -#include #include -#include +#include +#include #include // dynamic_extent -#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp index f6834b0b4133e..7baaa7ec9898e 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/conversion.pass.cpp @@ -28,15 +28,13 @@ // (((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || ... ) || // (numeric_limits::max() < numeric_limits::max()) -#include #include -#include +#include #include +#include #include // dynamic_extent #include -#include "test_macros.h" - template constexpr void test_implicit_conversion(To dest, From src) { assert(dest == src); diff --git a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp index 1a6501b391396..82f111153ce6a 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/ctad.pass.cpp @@ -16,8 +16,9 @@ // Remarks: The deduced type is dextents. // until C++26 // Remarks: The deduced type is extents...>. // since C++26 -#include #include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp index 29dd9e2d27072..1d5f61d02e4ff 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/obs_static.pass.cpp @@ -26,8 +26,9 @@ // Returns: Di. 
// -#include #include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp index 2924da91f77ee..10bc769473951 100644 --- a/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/extents/types.pass.cpp @@ -22,9 +22,10 @@ // ... // } -#include #include #include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp index c8b4083291a68..151da5ba61740 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/comparison.pass.cpp @@ -15,11 +15,10 @@ // ` // Constraints: extents_type::rank() == OtherExtents::rank() is true. -#include #include -#include +#include +#include #include // dynamic_extent -#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp index 5a4040317d243..12c59a4caf10e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.default.pass.cpp @@ -14,9 +14,10 @@ // // constexpr mapping() noexcept = default; -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp index 46505cb961bbd..299012dc5af84 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.extents.pass.cpp @@ -17,9 +17,10 @@ // // 
Effects: Direct-non-list-initializes extents_ with e. -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp index 5f9bd4344d0ec..1e8f8fb54d5ef 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_right.pass.cpp @@ -20,9 +20,10 @@ // // Preconditions: other.required_span_size() is representable as a value of type index_type -#include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp index 34489b7c52d7d..1668c26a697d4 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.layout_stride.pass.cpp @@ -23,10 +23,10 @@ // // Effects: Direct-non-list-initializes extents_ with other.extents(). 
-#include #include #include -#include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp index 63b3c50c73175..737e5f3d25728 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/ctor.mapping.pass.cpp @@ -18,9 +18,10 @@ // // Preconditions: other.required_span_size() is representable as a value of type index_type -#include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp index 40cd6bc2812e3..84c3ef45c69b6 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/index_operator.pass.cpp @@ -23,11 +23,12 @@ // Preconditions: // * extents_type::index-cast(i) is a multidimensional index in extents_. 
-#include -#include #include +#include #include +#include #include // dynamic_extent +#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp index 19f523824cfc4..32442ecd5a0e6 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/properties.pass.cpp @@ -26,11 +26,10 @@ // }; // } -#include #include -#include +#include +#include #include // dynamic_extent -#include #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp index 4cb111d29827a..9ad61b0799c15 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/required_span_size.pass.cpp @@ -14,9 +14,10 @@ // // Returns: extents().fwd-prod-of-extents(extents_type::rank()). -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp index 7a6add60efcd1..6410fecdab59e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_left/static_requirements.pass.cpp @@ -76,9 +76,9 @@ // Result: A constant expression ([expr.const]) of type bool. // Returns: true only if m.is_strided() is true for all possible objects m of type M. 
-#include #include -#include +#include +#include #include // dynamic_extent #include #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp index 03c78ca5e91d9..a65d7d39db8e2 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/comparison.pass.cpp @@ -15,11 +15,10 @@ // ` // Constraints: extents_type::rank() == OtherExtents::rank() is true. -#include #include -#include +#include +#include #include // dynamic_extent -#include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp index f02174416f33c..d644b0ff18d82 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.default.pass.cpp @@ -14,9 +14,10 @@ // // constexpr mapping() noexcept = default; -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp index 9c2c39bc3cb3a..cd0cff838fac5 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.extents.pass.cpp @@ -17,9 +17,10 @@ // // Effects: Direct-non-list-initializes extents_ with e. 
-#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp index 61aba5dae6829..994d98a803211 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_left.pass.cpp @@ -20,9 +20,10 @@ // // Preconditions: other.required_span_size() is representable as a value of type index_type -#include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp index 3bc7d82f8ed8d..89321f860dc3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.layout_stride.pass.cpp @@ -23,10 +23,10 @@ // // Effects: Direct-non-list-initializes extents_ with other.extents(). 
-#include #include #include -#include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp index eeea5ab021e97..ee00c688301ef 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/ctor.mapping.pass.cpp @@ -18,9 +18,10 @@ // // Preconditions: other.required_span_size() is representable as a value of type index_type -#include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp index 989078f17d303..42d4e9a2d24b6 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/index_operator.pass.cpp @@ -23,9 +23,10 @@ // Preconditions: // * extents_type::index-cast(i) is a multidimensional index in extents_. 
-#include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp index 674a7ac98cbf7..857ec3cb05835 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/properties.pass.cpp @@ -26,11 +26,10 @@ // }; // } -#include #include -#include +#include +#include #include // dynamic_extent -#include #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp index 0128d8c26a83e..2ffd1f41f9638 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/required_span_size.pass.cpp @@ -14,10 +14,10 @@ // // Returns: extents().fwd-prod-of-extents(extents_type::rank()). - -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp index 2b11d17c6717a..b7e01d14532d4 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_right/static_requirements.pass.cpp @@ -76,9 +76,9 @@ // Result: A constant expression ([expr.const]) of type bool. // Returns: true only if m.is_strided() is true for all possible objects m of type M. 
-#include #include -#include +#include +#include #include // dynamic_extent #include #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp index 108c4c6fca98e..055986d80c726 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.default.pass.cpp @@ -20,9 +20,10 @@ // Effects: Direct-non-list-initializes extents_ with extents_type(), and for all d in the range [0, rank_), // direct-non-list-initializes strides_[d] with layout_right::mapping().stride(d). -#include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp index cecfb79ea6867..bbda6d4a73631 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_array.pass.cpp @@ -27,10 +27,11 @@ // Effects: Direct-non-list-initializes extents_ with e, and for all d in the range [0, rank_), // direct-non-list-initializes strides_[d] with as_const(s[d]). 
-#include #include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp index d0f26ad23df98..f5db6768f2c0e 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/ctor.extents_span.pass.cpp @@ -27,10 +27,11 @@ // Effects: Direct-non-list-initializes extents_ with e, and for all d in the range [0, rank_), // direct-non-list-initializes strides_[d] with as_const(s[d]). -#include #include #include +#include #include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp index ca88a9f8e0440..233eebff0ccd0 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/deduction.pass.cpp @@ -10,13 +10,12 @@ // -#include #include #include -#include +#include #include +#include #include // dynamic_extent -#include #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp index 589e32f86e39d..9f51cc01cf9df 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/is_exhaustive_corner_case.pass.cpp @@ -19,14 +19,11 @@ // range [1, rank_), where pi is the ith element of P. // - Otherwise, false. 
-#include #include #include -#include +#include +#include #include // dynamic_extent -#include - -#include "test_macros.h" template constexpr void diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp index b1eb84b375b6e..eac1029882e3a 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/properties.pass.cpp @@ -38,10 +38,11 @@ // range [1, rank_), where pi is the ith element of P. // - Otherwise, false. -#include #include #include #include +#include +#include #include // dynamic_extent #include diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp index 870518994a939..629849646bb22 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/required_span_size.pass.cpp @@ -19,10 +19,11 @@ // // Returns: REQUIRED-SPAN-SIZE(extents(), strides_). -#include #include #include +#include #include +#include #include // dynamic_extent #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp index a69fb4f287c3e..8131ecde5d769 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/static_requirements.pass.cpp @@ -76,9 +76,9 @@ // Result: A constant expression ([expr.const]) of type bool. // Returns: true only if m.is_strided() is true for all possible objects m of type M. 
-#include #include -#include +#include +#include #include // dynamic_extent #include #include diff --git a/libcxx/test/std/containers/views/mdspan/mdspan/CustomTestAccessors.h b/libcxx/test/std/containers/views/mdspan/mdspan/CustomTestAccessors.h index b68268d172a1f..0795926cb43ff 100644 --- a/libcxx/test/std/containers/views/mdspan/mdspan/CustomTestAccessors.h +++ b/libcxx/test/std/containers/views/mdspan/mdspan/CustomTestAccessors.h @@ -17,9 +17,10 @@ #ifndef TEST_STD_CONTAINERS_VIEWS_MDSPAN_MDSPAN_CUSTOM_TEST_ACCESSORS_H #define TEST_STD_CONTAINERS_VIEWS_MDSPAN_MDSPAN_CUSTOM_TEST_ACCESSORS_H +#include +#include #include #include -#include // This contains a bunch of accessors and handles which have different properties // regarding constructibility and convertibility in order to test mdspan constraints diff --git a/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.pass.cpp index fbbd3d6ff4044..fcc2625ee6596 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.pass.cpp @@ -14,10 +14,10 @@ // If Extent is not equal to dynamic_extent, then count shall be equal to Extent. // - -#include #include +#include #include +#include #include template diff --git a/libcxx/test/std/containers/views/views.span/span.cons/iterator_sentinel.verify.cpp b/libcxx/test/std/containers/views/views.span/span.cons/iterator_sentinel.verify.cpp index a31aa2af7b9dd..937d8d921ea0b 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/iterator_sentinel.verify.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/iterator_sentinel.verify.cpp @@ -15,8 +15,9 @@ // If Extent is not equal to dynamic_extent, then last - first shall be equal to Extent. 
// -#include +#include #include +#include template std::span createImplicitSpan(T* first, T* last) { diff --git a/libcxx/test/std/containers/views/views.span/span.objectrep/as_bytes.pass.cpp b/libcxx/test/std/containers/views/views.span/span.objectrep/as_bytes.pass.cpp index 1f58d0f969f70..44b658fb83759 100644 --- a/libcxx/test/std/containers/views/views.span/span.objectrep/as_bytes.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.objectrep/as_bytes.pass.cpp @@ -16,9 +16,9 @@ // : sizeof(ElementType) * Extent> // as_bytes(span s) noexcept; - -#include #include +#include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/containers/views/views.span/span.objectrep/as_writable_bytes.pass.cpp b/libcxx/test/std/containers/views/views.span/span.objectrep/as_writable_bytes.pass.cpp index 6b7bd5dcf0c1e..d38d69d9fee1d 100644 --- a/libcxx/test/std/containers/views/views.span/span.objectrep/as_writable_bytes.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.objectrep/as_writable_bytes.pass.cpp @@ -16,9 +16,9 @@ // : sizeof(ElementType) * Extent> // as_writable_bytes(span s) noexcept; - -#include #include +#include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp b/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp index 3f614efee2036..437d0f0a91177 100644 --- a/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/is_error_code_enum.pass.cpp @@ -12,8 +12,10 @@ // template <> struct is_error_code_enum<> : public false_type {}; -#include +#include #include +#include + #include "test_macros.h" template diff --git a/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp b/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp index e9916f2427a55..f4d5057948acd 100644 --- a/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp +++ 
b/libcxx/test/std/diagnostics/syserr/is_error_condition_enum.pass.cpp @@ -12,9 +12,11 @@ // template constexpr bool is_error_condition_enum_v; +#include #include #include #include + #include "test_macros.h" template diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_broadcast.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_broadcast.pass.cpp index 8a291632a8ab1..fbdaa438f7886 100644 --- a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_broadcast.pass.cpp +++ b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_broadcast.pass.cpp @@ -14,6 +14,9 @@ // [simd.class] // template simd(U&& value) noexcept; +#include +#include + #include "../test_utils.h" namespace ex = std::experimental::parallelism_v2; diff --git a/libcxx/test/std/experimental/simd/test_utils.h b/libcxx/test/std/experimental/simd/test_utils.h index 3c227a43c2f4d..4c7b459cc8eb1 100644 --- a/libcxx/test/std/experimental/simd/test_utils.h +++ b/libcxx/test/std/experimental/simd/test_utils.h @@ -9,12 +9,13 @@ #ifndef LIBCXX_TEST_STD_EXPERIMENTAL_SIMD_TEST_UTILS_H #define LIBCXX_TEST_STD_EXPERIMENTAL_SIMD_TEST_UTILS_H -#include #include #include +#include +#include #include #include -#include + #include "type_algorithms.h" namespace ex = std::experimental::parallelism_v2; diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/incrementable_traits.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/incrementable_traits.compile.pass.cpp index 8413f912e5762..6d07e973ec306 100644 --- a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/incrementable_traits.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/incrementable_traits.compile.pass.cpp @@ -11,12 +11,9 @@ // template // struct incrementable_traits; -#include - #include #include - -#include "test_macros.h" 
+#include template concept check_has_difference_type = requires { diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/iter_difference_t.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/iter_difference_t.compile.pass.cpp index bd6664fe957a2..99512f7c3ba1c 100644 --- a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/iter_difference_t.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/incrementable.traits/iter_difference_t.compile.pass.cpp @@ -14,6 +14,7 @@ #include #include +#include #include template diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/readable.traits/indirectly_readable_traits.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/readable.traits/indirectly_readable_traits.compile.pass.cpp index 835aa9c7be278..4fd935e460ca8 100644 --- a/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/readable.traits/indirectly_readable_traits.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/iterator.assoc.types/readable.traits/indirectly_readable_traits.compile.pass.cpp @@ -14,9 +14,9 @@ #include #include +#include #include #include -#include #include template diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/types.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/types.pass.cpp index f8d3e2b4fdc7b..68edbb1cb91de 100644 --- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/types.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/types.pass.cpp @@ -27,6 +27,7 @@ // typedef void pointer; // }; +#include #include #include #include diff --git 
a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/types.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/types.pass.cpp index f71ba368ab861..c74f9704d0c35 100644 --- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/types.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/types.pass.cpp @@ -28,6 +28,7 @@ // typedef output_iterator_tag iterator_category; // }; +#include #include #include #include diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/insert.iterator/types.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/insert.iterator/types.pass.cpp index 08864868342ae..faf5ca5d61836 100644 --- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/insert.iterator/types.pass.cpp +++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/insert.iterator/types.pass.cpp @@ -28,9 +28,11 @@ // typedef void pointer; // }; +#include #include #include #include + #include "test_macros.h" template diff --git a/libcxx/test/std/numerics/bit/byteswap.pass.cpp b/libcxx/test/std/numerics/bit/byteswap.pass.cpp index b87faf150177f..9d4e328ed9d0f 100644 --- a/libcxx/test/std/numerics/bit/byteswap.pass.cpp +++ b/libcxx/test/std/numerics/bit/byteswap.pass.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/libcxx/test/std/numerics/numeric.ops/exclusive.scan/exclusive_scan.pass.cpp b/libcxx/test/std/numerics/numeric.ops/exclusive.scan/exclusive_scan.pass.cpp index bef2dd7866964..f6cac33e94438 100644 --- a/libcxx/test/std/numerics/numeric.ops/exclusive.scan/exclusive_scan.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/exclusive.scan/exclusive_scan.pass.cpp @@ -16,12 +16,11 @@ // OutputIterator result, T init); // -#include #include #include #include -#include -#include +#include +#include #include "test_macros.h" #include "test_iterators.h" diff 
--git a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan.pass.cpp b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan.pass.cpp index 299d085d01a06..1c79eebb8ee2c 100644 --- a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan.pass.cpp @@ -16,12 +16,11 @@ // OutputIterator result, T init); // -#include #include #include #include -#include -#include +#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op.pass.cpp b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op.pass.cpp index 87bea923eed3f..fb39b4f06ece7 100644 --- a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op.pass.cpp @@ -17,12 +17,12 @@ // OutputIterator result, // BinaryOperation binary_op); // C++17 -#include #include #include #include +#include #include -#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op_init.pass.cpp b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op_init.pass.cpp index 2e21d38f47344..4e07306c29311 100644 --- a/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op_init.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/inclusive.scan/inclusive_scan_op_init.pass.cpp @@ -17,12 +17,12 @@ // OutputIterator result, // BinaryOperation binary_op, T init); // C++17 -#include #include #include #include +#include #include -#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git a/libcxx/test/std/numerics/numeric.ops/transform.exclusive.scan/transform_exclusive_scan_init_bop_uop.pass.cpp 
b/libcxx/test/std/numerics/numeric.ops/transform.exclusive.scan/transform_exclusive_scan_init_bop_uop.pass.cpp index 52272205307a9..7e1017d4e3040 100644 --- a/libcxx/test/std/numerics/numeric.ops/transform.exclusive.scan/transform_exclusive_scan_init_bop_uop.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/transform.exclusive.scan/transform_exclusive_scan_init_bop_uop.pass.cpp @@ -18,13 +18,12 @@ // BinaryOperation binary_op, // UnaryOperation unary_op); - -#include #include #include #include +#include #include -#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git a/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop.pass.cpp index 80ead01e9a795..1dd7661bb42ed 100644 --- a/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop.pass.cpp @@ -18,13 +18,12 @@ // BinaryOperation binary_op, // UnaryOperation unary_op); - -#include #include #include #include +#include #include -#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git a/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop_init.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop_init.pass.cpp index 18be676c7a54d..1269c3f68236b 100644 --- a/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop_init.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/transform.inclusive.scan/transform_inclusive_scan_bop_uop_init.pass.cpp @@ -19,13 +19,12 @@ // UnaryOperation unary_op, // T init); - -#include #include #include #include +#include #include -#include +#include #include "test_macros.h" #include "test_iterators.h" diff --git 
a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp index ec7f72edf9ee9..c2d280406852a 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval.pass.cpp @@ -15,13 +15,12 @@ // template result_type operator()(_URNG& g); -#include #include #include #include -#include +#include #include -#include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp index 9bcb2ed3afac1..e31b4c5837fe9 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/eval_param.pass.cpp @@ -15,13 +15,12 @@ // template result_type operator()(_URNG& g, const param_type& parm); -#include #include // for sort #include #include -#include +#include #include -#include +#include #include #include "test_macros.h" diff --git a/libcxx/test/std/ranges/range.adaptors/range.chunk.by/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.chunk.by/ctor.default.pass.cpp index 98c6cb7af5f56..96d96053f74ba 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.chunk.by/ctor.default.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.chunk.by/ctor.default.pass.cpp @@ -16,6 +16,7 @@ #include #include +#include #include constexpr int buff[] = {-2, 1, -1, 2}; diff --git a/libcxx/test/std/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/deref.pass.cpp index 8cc9bdac0b1d8..6a5ce1335eccf 100644 --- 
a/libcxx/test/std/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/deref.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/deref.pass.cpp @@ -17,13 +17,12 @@ #include #include #include +#include #include #include -#include #include "../types.h" #include "test_iterators.h" -#include "test_macros.h" template > constexpr void test() { diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/member_typedefs.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/member_typedefs.compile.pass.cpp index 2ef09a6d92169..d7202e35e66d3 100644 --- a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/member_typedefs.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/member_typedefs.compile.pass.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/minus.pass.cpp b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/minus.pass.cpp index 3a373741f4c0e..523eefa69b4ca 100644 --- a/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/minus.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.repeat.view/iterator/minus.pass.cpp @@ -11,10 +11,10 @@ // friend constexpr iterator operator-(iterator i, difference_type n); // friend constexpr difference_type operator-(const iterator& x, const iterator& y); -#include #include -#include #include +#include +#include constexpr bool test() { // - difference_type diff --git a/libcxx/test/std/ranges/range.utility/range.subrange/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.utility/range.subrange/ctad.compile.pass.cpp index 670f9808b902c..2bf6ca6fb0a0c 100644 --- a/libcxx/test/std/ranges/range.utility/range.subrange/ctad.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.utility/range.subrange/ctad.compile.pass.cpp @@ -13,7 +13,8 @@ #include #include 
-#include "test_macros.h" +#include + #include "test_iterators.h" using FI = forward_iterator; diff --git a/libcxx/test/std/strings/string.view/string.view.modifiers/remove_prefix.pass.cpp b/libcxx/test/std/strings/string.view/string.view.modifiers/remove_prefix.pass.cpp index 4bfa0f38829dd..26db908428c32 100644 --- a/libcxx/test/std/strings/string.view/string.view.modifiers/remove_prefix.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.modifiers/remove_prefix.pass.cpp @@ -12,8 +12,9 @@ // void remove_prefix(size_type _n) -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/strings/string.view/string.view.modifiers/remove_suffix.pass.cpp b/libcxx/test/std/strings/string.view/string.view.modifiers/remove_suffix.pass.cpp index 6d57f6c24a487..b6edccc68c9e8 100644 --- a/libcxx/test/std/strings/string.view/string.view.modifiers/remove_suffix.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.modifiers/remove_suffix.pass.cpp @@ -12,8 +12,9 @@ // void remove_suffix(size_type _n) -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/strings/string.view/string.view.modifiers/swap.pass.cpp b/libcxx/test/std/strings/string.view/string.view.modifiers/swap.pass.cpp index de0c5f423c4e8..d18a7b28688dc 100644 --- a/libcxx/test/std/strings/string.view/string.view.modifiers/swap.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.modifiers/swap.pass.cpp @@ -12,8 +12,9 @@ // void swap(basic_string_view& _other) noexcept -#include #include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/strings/string.view/string.view.ops/copy.pass.cpp b/libcxx/test/std/strings/string.view/string.view.ops/copy.pass.cpp index 1d97723b90edc..0b27a05056c68 100644 --- a/libcxx/test/std/strings/string.view/string.view.ops/copy.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.ops/copy.pass.cpp @@ -18,10 +18,11 @@ // Effects: Equivalent to 
std::copy_n(begin() + pos, rlen, s). // Returns: rlen. -#include #include #include +#include #include +#include #include "test_macros.h" diff --git a/libcxx/test/std/strings/string.view/string.view.ops/substr.pass.cpp b/libcxx/test/std/strings/string.view/string.view.ops/substr.pass.cpp index 1c8e0aebabd41..62b0259c175f8 100644 --- a/libcxx/test/std/strings/string.view/string.view.ops/substr.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.ops/substr.pass.cpp @@ -16,10 +16,11 @@ // Effects: Determines the effective length rlen of the string to reference as the smaller of n and size() - pos. // Returns: basic_string_view(data()+pos, rlen). -#include #include -#include #include +#include +#include +#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pass.cpp index 1704eb67562e1..b9209ac2c89b6 100644 --- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pass.cpp @@ -33,9 +33,9 @@ #include #include +#include #include -#include "test_macros.h" #include "test_iterators.h" template struct MyHash { diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pred.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pred.pass.cpp index 3648be9bb06ac..d616301767ede 100644 --- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pred.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/hash.pred.pass.cpp @@ -33,9 +33,9 @@ #include #include +#include #include -#include "test_macros.h" #include "test_iterators.h" template struct MyHash { diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pass.cpp 
b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pass.cpp index d0e6d5f8bfb6b..19f5710819c46 100644 --- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pass.cpp @@ -31,10 +31,10 @@ // }; #include -#include #include +#include +#include -#include "test_macros.h" #include "test_iterators.h" template struct MyHash { diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pred.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pred.pass.cpp index 59b5b30d7f1b5..3d29a05efd40c 100644 --- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pred.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/hash.pred.pass.cpp @@ -31,10 +31,10 @@ // }; #include -#include #include +#include +#include -#include "test_macros.h" #include "test_iterators.h" template struct MyHash { diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp index 381bcda761700..064f7acdf71a0 100644 --- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp @@ -13,8 +13,8 @@ // UNSUPPORTED: c++03, c++11, c++14 +#include #include -#include #include #include "test_macros.h" @@ -153,4 +153,3 @@ struct invalid_c_vararg { R operator()(int, ...) 
{ return {}; } }; static_assert(!can_deduce); static_assert(!can_deduce); static_assert(!can_deduce); - diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp index 577a9bf0e15ba..448c5ba143c10 100644 --- a/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/unord.hash/pointer.pass.cpp @@ -19,10 +19,10 @@ // Not very portable -#include #include +#include +#include #include -#include #include "test_macros.h" diff --git a/libcxx/test/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp b/libcxx/test/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp index 272441ebedc2f..92e8879d05bf0 100644 --- a/libcxx/test/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp +++ b/libcxx/test/std/utilities/memory/specialized.algorithms/specialized.construct/construct_at.pass.cpp @@ -13,8 +13,9 @@ // template // constexpr T* construct_at(T* location, Args&& ...args); -#include #include +#include +#include #include #include "test_iterators.h" diff --git a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp index 4c66370fac922..3fa9355270866 100644 --- a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp +++ b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp @@ -22,12 +22,11 @@ // return_temporary_buffer(T* p); #include +#include #include #include #include -#include "test_macros.h" - struct alignas(32) A { int field; }; diff --git a/libcxx/test/std/utilities/memory/temporary.buffer/temporary_buffer.pass.cpp b/libcxx/test/std/utilities/memory/temporary.buffer/temporary_buffer.pass.cpp index 5f7fc4571906c..39a4767d874e3 100644 --- 
a/libcxx/test/std/utilities/memory/temporary.buffer/temporary_buffer.pass.cpp +++ b/libcxx/test/std/utilities/memory/temporary.buffer/temporary_buffer.pass.cpp @@ -19,12 +19,11 @@ // void // return_temporary_buffer(T* p); -#include #include +#include +#include #include -#include "test_macros.h" - int main(int, char**) { std::pair ip = std::get_temporary_buffer(5); diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp index 32db9d12099f2..2eecf7925d739 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_invocable.pass.cpp @@ -18,13 +18,12 @@ // Fn and all types in the template parameter pack ArgTypes shall be // complete types, cv void, or arrays of unknown bound. -#include +#include #include #include +#include #include -#include "test_macros.h" - struct Tag {}; struct DerFromTag : Tag {}; diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp index fa6048e869e18..47e81f38e54b3 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_nothrow_invocable.pass.cpp @@ -12,12 +12,10 @@ // is_nothrow_invocable +#include #include -#include #include -#include "test_macros.h" - struct Tag {}; struct Implicit { diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_overaligned_request.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_overaligned_request.pass.cpp index cf3dfe21d9675..ad967798747ab 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_overaligned_request.pass.cpp +++ 
b/libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_overaligned_request.pass.cpp @@ -14,8 +14,9 @@ // class monotonic_buffer_resource -#include #include +#include +#include #include "test_macros.h" #include "count_new.h" diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_overaligned_request.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_overaligned_request.pass.cpp index 35a6a8f135a40..f3e1e4c026ef7 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_overaligned_request.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_overaligned_request.pass.cpp @@ -14,9 +14,10 @@ // class synchronized_pool_resource -#include #include +#include #include // std::align +#include #include "count_new.h" #include "test_macros.h" diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_deallocate_matches_allocate.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_deallocate_matches_allocate.pass.cpp index d5b3b6e08a42a..f55ab1d1aa5c1 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_deallocate_matches_allocate.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_deallocate_matches_allocate.pass.cpp @@ -15,9 +15,10 @@ // class synchronized_pool_resource -#include #include #include +#include +#include #include #include diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_overaligned_request.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_overaligned_request.pass.cpp index cd8d8a7745925..5153a54ef3f14 100644 --- 
a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_overaligned_request.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_overaligned_request.pass.cpp @@ -14,9 +14,10 @@ // class unsynchronized_pool_resource -#include #include +#include #include // std::align +#include #include "count_new.h" #include "test_macros.h" diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_deallocate_matches_allocate.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_deallocate_matches_allocate.pass.cpp index fe5f4736a7b1a..a0e86f82e9b2d 100644 --- a/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_deallocate_matches_allocate.pass.cpp +++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_deallocate_matches_allocate.pass.cpp @@ -15,9 +15,10 @@ // class unsynchronized_pool_resource -#include #include #include +#include +#include #include #include diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py index 41524e8fe7186..59dd707ae6126 100755 --- a/libcxx/utils/generate_escaped_output_table.py +++ b/libcxx/utils/generate_escaped_output_table.py @@ -218,7 +218,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: #include <__algorithm/ranges_upper_bound.h> #include <__config> -#include +#include <__cstddef/ptrdiff_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/utils/generate_extended_grapheme_cluster_table.py b/libcxx/utils/generate_extended_grapheme_cluster_table.py index 558b606186130..eba88a4f48776 100755 --- a/libcxx/utils/generate_extended_grapheme_cluster_table.py +++ b/libcxx/utils/generate_extended_grapheme_cluster_table.py @@ -214,8 +214,8 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: #include 
<__algorithm/ranges_upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/access.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/utils/generate_indic_conjunct_break_table.py b/libcxx/utils/generate_indic_conjunct_break_table.py index e41f6e9be233d..580d8157ffebf 100755 --- a/libcxx/utils/generate_indic_conjunct_break_table.py +++ b/libcxx/utils/generate_indic_conjunct_break_table.py @@ -207,8 +207,8 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: #include <__algorithm/ranges_upper_bound.h> #include <__config> +#include <__cstddef/ptrdiff_t.h> #include <__iterator/access.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/utils/generate_width_estimation_table.py b/libcxx/utils/generate_width_estimation_table.py index d8c036f34e835..f81f0ba77489e 100644 --- a/libcxx/utils/generate_width_estimation_table.py +++ b/libcxx/utils/generate_width_estimation_table.py @@ -246,7 +246,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]: #include <__algorithm/ranges_upper_bound.h> #include <__config> -#include +#include <__cstddef/ptrdiff_t.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxxabi/src/demangle/cp-to-llvm.sh b/libcxxabi/src/demangle/cp-to-llvm.sh index cfe32c228e65d..f8b3585a5fa37 100755 --- a/libcxxabi/src/demangle/cp-to-llvm.sh +++ b/libcxxabi/src/demangle/cp-to-llvm.sh @@ -18,7 +18,7 @@ if [[ ! -d "$LLVM_DEMANGLE_DIR" ]]; then exit 1 fi -read -p "This will overwrite the copies of $FILES in $LLVM_DEMANGLE_DIR; are you sure? [y/N]" -n 1 -r ANSWER +read -p "This will overwrite the copies of $HDRS in $LLVM_DEMANGLE_DIR; are you sure? 
[y/N]" -n 1 -r ANSWER echo if [[ $ANSWER =~ ^[Yy]$ ]]; then diff --git a/libcxxabi/src/private_typeinfo.cpp b/libcxxabi/src/private_typeinfo.cpp index 9dba91e1985e3..2f631041f74c9 100644 --- a/libcxxabi/src/private_typeinfo.cpp +++ b/libcxxabi/src/private_typeinfo.cpp @@ -41,9 +41,11 @@ // Defining _LIBCXXABI_FORGIVING_DYNAMIC_CAST does not help since can_catch() calls // is_equal() with use_strcmp=false so the string names are not compared. -#include #include +#include +#include #include + #include "abort_message.h" #ifdef _LIBCXXABI_FORGIVING_DYNAMIC_CAST diff --git a/libcxxabi/test/test_aux_runtime.pass.cpp b/libcxxabi/test/test_aux_runtime.pass.cpp index dde553864e38e..499382c782acd 100644 --- a/libcxxabi/test/test_aux_runtime.pass.cpp +++ b/libcxxabi/test/test_aux_runtime.pass.cpp @@ -25,9 +25,13 @@ bool bad_typeid_test () { class A { virtual void f() {}}; class B { virtual void g() {}}; - B *bp = NULL; - try {bool b = typeid(*bp) == typeid (A); ((void)b); } - catch ( const std::bad_typeid &) { return true; } + B* bp = nullptr; + try { + bool b = typeid(*bp) == typeid(A); + ((void)b); + } catch (const std::bad_typeid&) { + return true; + } return false; } diff --git a/libunwind/docs/index.rst b/libunwind/docs/index.rst index e1283e7acfcc2..0354246401146 100644 --- a/libunwind/docs/index.rst +++ b/libunwind/docs/index.rst @@ -78,7 +78,7 @@ and `Getting started with LLVM `__. If you think you've found a bug in libunwind, please report it using the `LLVM bug tracker`_. If you're not sure, you -can ask for support on the `Runtimes forum`_ or on IRC. +can ask for support on the `Runtimes forum`_ or on Discord. Please use the tag "libunwind" for new threads. 
**Patches** diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 08c1476a595f6..d717afac47389 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -415,7 +415,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_entry: if (!arg->getValue()[0]) fatal("missing entry point symbol name"); - ctx.config.entry = addUndefined(mangle(arg->getValue())); + ctx.config.entry = addUndefined(mangle(arg->getValue()), true); break; case OPT_failifmismatch: checkFailIfMismatch(arg->getValue(), file); @@ -696,12 +696,33 @@ void LinkerDriver::addLibSearchPaths() { } } -Symbol *LinkerDriver::addUndefined(StringRef name) { +Symbol *LinkerDriver::addUndefined(StringRef name, bool aliasEC) { Symbol *b = ctx.symtab.addUndefined(name); if (!b->isGCRoot) { b->isGCRoot = true; ctx.config.gcroot.push_back(b); } + + // On ARM64EC, a symbol may be defined in either its mangled or demangled form + // (or both). Define an anti-dependency symbol that binds both forms, similar + // to how compiler-generated code references external functions. 
+ if (aliasEC && isArm64EC(ctx.config.machine)) { + if (std::optional mangledName = + getArm64ECMangledFunctionName(name)) { + auto u = dyn_cast(b); + if (u && !u->weakAlias) { + Symbol *t = ctx.symtab.addUndefined(saver().save(*mangledName)); + u->setWeakAlias(t, true); + } + } else { + std::optional demangledName = + getArm64ECDemangledFunctionName(name); + Symbol *us = ctx.symtab.addUndefined(saver().save(*demangledName)); + auto u = dyn_cast(us); + if (u && !u->weakAlias) + u->setWeakAlias(b, true); + } + } return b; } @@ -2342,22 +2363,22 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_entry)) { if (!arg->getValue()[0]) fatal("missing entry point symbol name"); - config->entry = addUndefined(mangle(arg->getValue())); + config->entry = addUndefined(mangle(arg->getValue()), true); } else if (!config->entry && !config->noEntry) { if (args.hasArg(OPT_dll)) { StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12" : "_DllMainCRTStartup"; - config->entry = addUndefined(s); + config->entry = addUndefined(s, true); } else if (config->driverWdm) { // /driver:wdm implies /entry:_NtProcessStartup - config->entry = addUndefined(mangle("_NtProcessStartup")); + config->entry = addUndefined(mangle("_NtProcessStartup"), true); } else { // Windows specific -- If entry point name is not given, we need to // infer that from user-defined entry name. 
StringRef s = findDefaultEntry(); if (s.empty()) fatal("entry point must be defined"); - config->entry = addUndefined(s); + config->entry = addUndefined(s, true); log("Entry name inferred: " + s); } } @@ -2371,7 +2392,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (config->machine == I386) { config->delayLoadHelper = addUndefined("___delayLoadHelper2@8"); } else { - config->delayLoadHelper = addUndefined("__delayLoadHelper2"); + config->delayLoadHelper = addUndefined("__delayLoadHelper2", true); } } } @@ -2505,7 +2526,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { for (Export &e : config->exports) { if (!e.forwardTo.empty()) continue; - e.sym = addUndefined(e.name); + e.sym = addUndefined(e.name, !e.data); if (e.source != ExportSource::Directives) e.symbolName = mangleMaybe(e.sym); } diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index 58a2ed2310624..3889feb7511c0 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -170,7 +170,7 @@ class LinkerDriver { std::set visitedLibs; - Symbol *addUndefined(StringRef sym); + Symbol *addUndefined(StringRef sym, bool aliasEC = false); void addUndefinedGlob(StringRef arg); diff --git a/lld/test/COFF/arm64ec-delayimport.test b/lld/test/COFF/arm64ec-delayimport.test index a0236d902eeab..6797d84e08868 100644 --- a/lld/test/COFF/arm64ec-delayimport.test +++ b/lld/test/COFF/arm64ec-delayimport.test @@ -2,12 +2,14 @@ REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows helper-mangled.s -o helper-mangled.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows helper-demangled.s -o helper-demangled.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib RUN: lld-link -machine:arm64ec -dll 
-noentry -out:out.dll loadconfig-arm64ec.obj test.obj \ -RUN: test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll -map +RUN: helper-mangled.obj test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll -map RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s TESTSEC: 0x180008000 00600000 88700000 00200000 10100000 @@ -97,7 +99,7 @@ IMPORTS-NEXT: } IMPORTS-NEXT: } RUN: FileCheck --check-prefix=MAP %s < out.map -MAP: 0001:00000008 #__delayLoadHelper2 0000000180001008 test.obj +MAP: 0001:00000008 #__delayLoadHelper2 0000000180001008 helper-mangled.obj MAP: 0001:00000010 #func 0000000180001010 test-arm64ec:test.dll MAP-NEXT: 0001:0000001c __impchk_func 000000018000101c test-arm64ec:test.dll MAP-NEXT: 0001:00000030 #func2 0000000180001030 test-arm64ec:test.dll @@ -138,6 +140,21 @@ RELOC-NEXT: Type: DIR64 RELOC-NEXT: Address: 0x6008 RELOC-NEXT: } +Verify that a demangled version of __delayLoadHelper2 can be used. + +RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj test.obj \ +RUN: helper-demangled.obj test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll +RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s + +Verify that the mangled version of __delayLoadHelper2 can be used from a library. +Even if an anti-dependency alias is defined by the helper, it won't appear in +the archive index, so we need to locate it by its mangled name. 
+ +RUN: llvm-lib -machine:arm64ec -out:helper.lib helper-mangled.obj +RUN: lld-link -machine:arm64ec -dll -noentry -out:out3.dll loadconfig-arm64ec.obj test.obj \ +RUN: helper.lib test-arm64ec.lib test2-arm64ec.lib -delayload:test.dll +RUN: llvm-objdump -d out3.dll | FileCheck --check-prefix=DISASM %s + #--- test.s .section .test,"r" .rva __imp_func @@ -159,16 +176,6 @@ __icall_helper_arm64ec: mov w0, #0 ret - .section .text,"xr",discard,"#__delayLoadHelper2" - .globl "#__delayLoadHelper2" - .p2align 2, 0x0 -"#__delayLoadHelper2": - mov w0, #1 - ret - - .weak_anti_dep __delayLoadHelper2 -.set __delayLoadHelper2,"#__delayLoadHelper2" - .section .hybmp$x, "yi" .symidx __imp_func .symidx func_exit_thunk @@ -189,6 +196,25 @@ func2_exit_thunk: mov w0, #3 ret +#--- helper-mangled.s + .section .text,"xr",discard,"#__delayLoadHelper2" + .globl "#__delayLoadHelper2" + .p2align 2, 0x0 +"#__delayLoadHelper2": + mov w0, #1 + ret + + .weak_anti_dep __delayLoadHelper2 +.set __delayLoadHelper2,"#__delayLoadHelper2" + +#--- helper-demangled.s + .section .text,"xr",discard,__delayLoadHelper2 + .globl __delayLoadHelper2 + .p2align 2, 0x0 +__delayLoadHelper2: + mov w0, #1 + ret + #--- test.def NAME test.dll EXPORTS diff --git a/lld/test/COFF/arm64ec-entry-mangle.test b/lld/test/COFF/arm64ec-entry-mangle.test new file mode 100644 index 0000000000000..65283f16d02fa --- /dev/null +++ b/lld/test/COFF/arm64ec-entry-mangle.test @@ -0,0 +1,129 @@ +REQUIRES: aarch64, x86 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows demangled-dll-main.s -o demangled-dll-main.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows mangled-dll-main.s -o mangled-dll-main.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows demangled-func.s -o demangled-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows mangled-func.s -o mangled-func.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ref-demangled.s -o ref-demangled.obj +RUN: llvm-mc -filetype=obj 
-triple=arm64ec-windows demangled-entry-drectve.s -o demangled-entry-drectve.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows demangled-dll-main.s -o x64-dll-main.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +RUN: llvm-lib -machine:arm64ec -out:func.lib mangled-func.obj +RUN: llvm-lib -machine:arm64ec -out:dllmain.lib mangled-dll-main.obj + +Ensure that the linker recognizes the demangled version of _DllMainCRTStartup. +RUN: lld-link -machine:arm64ec -dll -out:demangled-main.dll demangled-dll-main.obj loadconfig-arm64ec.obj +RUN: llvm-objdump -d demangled-main.dll | FileCheck -check-prefix=DISASM %s + +DISASM: 0000000180001000 <.text>: +DISASM-NEXT: 180001000: d65f03c0 ret +DISASM-EMPTY: +DISASM-NEXT: Disassembly of section .hexpthk: +DISASM-EMPTY: +DISASM: 180002000: 48 8b c4 movq %rsp, %rax +DISASM-NEXT: 180002003: 48 89 58 20 movq %rbx, 0x20(%rax) +DISASM-NEXT: 180002007: 55 pushq %rbp +DISASM-NEXT: 180002008: 5d popq %rbp +DISASM-NEXT: 180002009: e9 f2 ef ff ff jmp 0x180001000 <.text> +DISASM-NEXT: 18000200e: cc int3 +DISASM-NEXT: 18000200f: cc int3 + +Ensure that the linker recognizes the mangled version of #_DllMainCRTStartup. +RUN: lld-link -machine:arm64ec -dll -out:mangled-dllmain.dll mangled-dll-main.obj loadconfig-arm64ec.obj +RUN: llvm-objdump -d mangled-dllmain.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled version of _DllMainCRTStartup from an archive. +RUN: lld-link -machine:arm64ec -dll -out:mangled-lib-dllmain.dll dllmain.lib loadconfig-arm64ec.obj +RUN: llvm-objdump -d mangled-lib-dllmain.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the demangled entry function. 
+RUN: lld-link -machine:arm64ec -dll -out:demangled-entry.dll demangled-func.obj loadconfig-arm64ec.obj -entry:func +RUN: llvm-objdump -d demangled-entry.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled entry function when it is referenced by its demangled name. +RUN: lld-link -machine:arm64ec -dll -out:mangled-entry.dll mangled-func.obj loadconfig-arm64ec.obj -entry:func +RUN: llvm-objdump -d mangled-entry.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled entry function when it is referenced by its demangled +name in drectve section. +RUN: lld-link -machine:arm64ec -dll -out:mangled-entry.dll mangled-func.obj loadconfig-arm64ec.obj demangled-entry-drectve.obj +RUN: llvm-objdump -d mangled-entry.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled entry function from an archive. +RUN: lld-link -machine:arm64ec -dll -out:mangled-lib-entry.dll func.lib loadconfig-arm64ec.obj -entry:func +RUN: llvm-objdump -d mangled-lib-entry.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the entry function when referenced by its mangled name. +RUN: lld-link -machine:arm64ec -dll -out:mangled-entry2.dll mangled-func.obj loadconfig-arm64ec.obj "-entry:#func" +RUN: llvm-objdump -d mangled-entry2.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the demangled exported function. +RUN: lld-link -machine:arm64ec -dll -out:demangled-export.dll demangled-func.obj \ +RUN: loadconfig-arm64ec.obj -noentry -export:func +RUN: llvm-objdump -d demangled-export.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled exported function when referenced by its demangled name. 
+RUN: lld-link -machine:arm64ec -dll -out:mangled-export.dll mangled-func.obj \ +RUN: loadconfig-arm64ec.obj -noentry -export:func +RUN: llvm-objdump -d mangled-export.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled exported function when referenced by its mangled name. +RUN: lld-link -machine:arm64ec -dll -out:mangled-export2.dll mangled-func.obj \ +RUN: loadconfig-arm64ec.obj -noentry "-export:#func" +RUN: llvm-objdump -d mangled-export2.dll | FileCheck -check-prefix=DISASM %s + +Verify that the linker recognizes the mangled exported function when referenced +by its mangled name and creates a demangled alias for it. +RUN: lld-link -machine:arm64ec -dll -noentry -out:demangled-export-ref.dll mangled-func.obj \ +RUN: ref-demangled.obj loadconfig-arm64ec.obj "-export:#func" +RUN: llvm-objdump -d demangled-export-ref.dll | FileCheck -check-prefix=DISASM %s + +DISASM2: 0000000180001000 <.text>: +DISASM2-NEXT: 180001000: d65f03c0 ret + +Verify that the linker emits appropriate errors for mismatched mangling. +RUN: not lld-link -machine:arm64ec -dll -out:test.dll demangled-func.obj loadconfig-arm64ec.obj \ +RUN: "-entry:#func" 2>&1 | FileCheck -check-prefix=FUNC-NOT-FOUND %s +RUN: not lld-link -machine:arm64ec -dll -out:test.dll demangled-func.obj loadconfig-arm64ec.obj \ +RUN: -noentry "-export:#func" 2>&1 | FileCheck -check-prefix=FUNC-NOT-FOUND %s +FUNC-NOT-FOUND: undefined symbol: #func + +Verify that the linker recognizes the demangled x86_64 _DllMainCRTStartup. 
+RUN: lld-link -machine:arm64ec -dll -out:test.dll x64-dll-main.obj loadconfig-arm64ec.obj +RUN: llvm-objdump -d test.dll | FileCheck -check-prefix=DISASM-X64 %s +DISASM-X64: 0000000180001000 <.text>: +DISASM-X64-NEXT: 180001000: c3 retq + +#--- demangled-dll-main.s + .text + .globl _DllMainCRTStartup +_DllMainCRTStartup: + ret + +#--- mangled-dll-main.s + .text + .globl "#_DllMainCRTStartup" +"#_DllMainCRTStartup": + ret + +#--- demangled-func.s + .text + .globl func +func: + ret + +#--- mangled-func.s + .text + .globl "#func" +"#func": + ret + +#--- ref-demangled.s + .data + .rva func + +#--- demangled-entry-drectve.s + .section .drectve,"rd" + .ascii " -entry:func" diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index e2c15d872b4be..fb22bdecad37e 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -181,6 +181,7 @@ interesting areas to contribute to lldb. Source Code Releases + Discord Discussion Forums Developer Policy Bug Reports diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocation.h b/lldb/include/lldb/Breakpoint/BreakpointLocation.h index cca00335bc3c6..3592291bb2d06 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocation.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocation.h @@ -11,10 +11,12 @@ #include #include +#include #include "lldb/Breakpoint/BreakpointOptions.h" #include "lldb/Breakpoint/StoppointHitCounter.h" #include "lldb/Core/Address.h" +#include "lldb/Symbol/LineEntry.h" #include "lldb/Utility/UserID.h" #include "lldb/lldb-private.h" @@ -282,6 +284,25 @@ class BreakpointLocation /// Returns the breakpoint location ID. lldb::break_id_t GetID() const { return m_loc_id; } + /// Set the line entry that should be shown to users for this location. + /// It is up to the caller to verify that this is a valid entry to show. + /// The current use of this is to distinguish among line entries from a + /// virtual inlined call stack that all share the same address. 
+ /// The line entry must have the same start address as the address for this + /// location. + bool SetPreferredLineEntry(const LineEntry &line_entry) { + if (m_address == line_entry.range.GetBaseAddress()) { + m_preferred_line_entry = line_entry; + return true; + } + assert(0 && "Tried to set a preferred line entry with a different address"); + return false; + } + + const std::optional GetPreferredLineEntry() { + return m_preferred_line_entry; + } + protected: friend class BreakpointSite; friend class BreakpointLocationList; @@ -306,6 +327,16 @@ class BreakpointLocation /// If it returns false we should continue, otherwise stop. bool IgnoreCountShouldStop(); + /// If this location knows that the virtual stack frame it represents is + /// not frame 0, return the suggested stack frame instead. This will happen + /// when the location's address contains a "virtual inlined call stack" and + /// the breakpoint was set on a file & line that are not at the bottom of that + /// stack. For now we key off the "preferred line entry" - looking for that + /// in the blocks that start with the stop PC. + /// This version of the API doesn't take an "inlined" parameter because it + /// only changes frames in the inline stack. + std::optional GetSuggestedStackFrameIndex(); + private: void SwapLocation(lldb::BreakpointLocationSP swap_from); @@ -369,6 +400,11 @@ class BreakpointLocation lldb::break_id_t m_loc_id; ///< Breakpoint location ID. StoppointHitCounter m_hit_counter; ///< Number of times this breakpoint /// location has been hit. + /// If this exists, use it to print the stop description rather than the + /// LineEntry m_address resolves to directly. Use this for instance when the + /// location was given somewhere in the virtual inlined call stack since the + /// Address always resolves to the lowest entry in the stack. 
+ std::optional m_preferred_line_entry; void SetShouldResolveIndirectFunctions(bool do_resolve) { m_should_resolve_indirect_functions = do_resolve; diff --git a/lldb/include/lldb/Breakpoint/BreakpointSite.h b/lldb/include/lldb/Breakpoint/BreakpointSite.h index 17b76d51c1ae5..7b3f7be23639f 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointSite.h +++ b/lldb/include/lldb/Breakpoint/BreakpointSite.h @@ -170,6 +170,11 @@ class BreakpointSite : public std::enable_shared_from_this, /// \see lldb::DescriptionLevel void GetDescription(Stream *s, lldb::DescriptionLevel level); + // This runs through all the breakpoint locations owning this site and returns + // the greatest of their suggested stack frame indexes. This only handles + // inlined stack changes. + std::optional GetSuggestedStackFrameIndex(); + /// Tell whether a breakpoint has a location at this site. /// /// \param[in] bp_id diff --git a/lldb/include/lldb/Core/Declaration.h b/lldb/include/lldb/Core/Declaration.h index 4a0e9047b5469..c864b88c6b32a 100644 --- a/lldb/include/lldb/Core/Declaration.h +++ b/lldb/include/lldb/Core/Declaration.h @@ -84,10 +84,14 @@ class Declaration { /// \param[in] declaration /// The const Declaration object to compare with. /// + /// \param[in] full + /// Same meaning as Full in FileSpec::Equal. True means an empty + /// directory is not equal to a specified one, false means it is equal. + /// /// \return /// Returns \b true if \b declaration is at the same file and /// line, \b false otherwise. - bool FileAndLineEqual(const Declaration &declaration) const; + bool FileAndLineEqual(const Declaration &declaration, bool full) const; /// Dump a description of this object to a Stream. 
/// diff --git a/lldb/include/lldb/Host/Editline.h b/lldb/include/lldb/Host/Editline.h index a02f90891599a..57e2c831e3499 100644 --- a/lldb/include/lldb/Host/Editline.h +++ b/lldb/include/lldb/Host/Editline.h @@ -30,9 +30,6 @@ #include "lldb/Host/Config.h" -#if LLDB_EDITLINE_USE_WCHAR -#include -#endif #include #include #include @@ -57,23 +54,6 @@ #include "llvm/ADT/FunctionExtras.h" -#if defined(__clang__) && defined(__has_warning) -#if __has_warning("-Wdeprecated-declarations") -#define LLDB_DEPRECATED_WARNING_DISABLE \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") -#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("clang diagnostic pop") -#endif -#elif defined(__GNUC__) && __GNUC__ > 6 -#define LLDB_DEPRECATED_WARNING_DISABLE \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("GCC diagnostic pop") -#else -#define LLDB_DEPRECATED_WARNING_DISABLE -#define LLDB_DEPRECATED_WARNING_RESTORE -#endif - namespace lldb_private { namespace line_editor { @@ -383,11 +363,6 @@ class Editline { void SetEditLinePromptCallback(EditlinePromptCallbackType callbackFn); void SetGetCharacterFunction(EditlineGetCharCallbackType callbackFn); -#if LLDB_EDITLINE_USE_WCHAR - LLDB_DEPRECATED_WARNING_DISABLE - std::wstring_convert> m_utf8conv; - LLDB_DEPRECATED_WARNING_RESTORE -#endif ::EditLine *m_editline = nullptr; EditlineHistorySP m_history_sp; bool m_in_history = false; diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h index 03d9f92799747..91188fe6ea483 100644 --- a/lldb/include/lldb/Symbol/Type.h +++ b/lldb/include/lldb/Symbol/Type.h @@ -84,6 +84,9 @@ FLAGS_ENUM(TypeQueryOptions){ /// matching type is found. When false, the type query should find all /// matching types. e_find_one = (1u << 4), + // If set, treat TypeQuery::m_name as a mangled name that should be + // searched. 
+ e_search_by_mangled_name = (1u << 5), }; LLDB_MARK_AS_BITMASK_ENUM(TypeQueryOptions) @@ -300,6 +303,19 @@ class TypeQuery { m_options &= ~e_find_one; } + /// Returns true if the type query is supposed to treat the name to be + /// searched as a mangled name. + bool GetSearchByMangledName() const { + return (m_options & e_search_by_mangled_name) != 0; + } + + void SetSearchByMangledName(bool b) { + if (b) + m_options |= e_search_by_mangled_name; + else + m_options &= ~e_search_by_mangled_name; + } + /// Access the internal compiler context array. /// /// Clients can use this to populate the context manually. diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index fae90364deaf0..45beac129e86f 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -77,6 +77,18 @@ class StopInfo : public std::enable_shared_from_this { m_description.clear(); } + /// This gives the StopInfo a chance to suggest a stack frame to select. + /// Passing true for inlined_stack will request changes to the inlined + /// call stack. Passing false will request changes to the real stack + /// frame. The inlined stack gets adjusted before we call into the thread + /// plans so they can reason based on the correct values. The real stack + /// adjustment is handled after the frame recognizers get a chance to adjust + /// the frame. 
+ virtual std::optional + GetSuggestedStackFrameIndex(bool inlined_stack) { + return {}; + } + virtual bool IsValidForOperatingSystemThread(Thread &thread) { return true; } /// A Continue operation can result in a false stop event diff --git a/lldb/include/lldb/Target/ThreadPlanStepInRange.h b/lldb/include/lldb/Target/ThreadPlanStepInRange.h index f9ef87942a7c0..9da8370ef1c92 100644 --- a/lldb/include/lldb/Target/ThreadPlanStepInRange.h +++ b/lldb/include/lldb/Target/ThreadPlanStepInRange.h @@ -80,8 +80,8 @@ class ThreadPlanStepInRange : public ThreadPlanStepRange, bool m_step_past_prologue; // FIXME: For now hard-coded to true, we could put // a switch in for this if there's // demand for that. - bool m_virtual_step; // true if we've just done a "virtual step", i.e. just - // moved the inline stack depth. + LazyBool m_virtual_step; // true if we've just done a "virtual step", i.e. + // just moved the inline stack depth. ConstString m_step_into_target; ThreadPlanStepInRange(const ThreadPlanStepInRange &) = delete; const ThreadPlanStepInRange & diff --git a/lldb/packages/Python/lldbsuite/test/configuration.py b/lldb/packages/Python/lldbsuite/test/configuration.py index 1bacd74a968c3..bcc179346836d 100644 --- a/lldb/packages/Python/lldbsuite/test/configuration.py +++ b/lldb/packages/Python/lldbsuite/test/configuration.py @@ -46,6 +46,10 @@ make_path = None # The overriden dwarf verison. +# Don't use this to test the current compiler's +# DWARF version, as this won't be set if the +# version isn't overridden. +# Use lldbplatformutils.getDwarfVersion() instead. dwarf_version = 0 # Any overridden settings. 
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index ad9057c8141e9..c7ea50407ae1c 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -508,8 +508,20 @@ void BreakpointLocation::GetDescription(Stream *s, s->PutCString("re-exported target = "); else s->PutCString("where = "); + + // If there's a preferred line entry for printing, use that. + bool show_function_info = true; + if (auto preferred = GetPreferredLineEntry()) { + sc.line_entry = *preferred; + // FIXME: We're going to get the function name wrong when the preferred + // line entry is not the lowest one. For now, just leave the function + // out in this case, but we really should also figure out how to easily + // fake the function name here. + show_function_info = false; + } sc.DumpStopContext(s, m_owner.GetTarget().GetProcessSP().get(), m_address, - false, true, false, true, true, true); + false, true, false, show_function_info, + show_function_info, show_function_info); } else { if (sc.module_sp) { s->EOL(); @@ -537,7 +549,10 @@ void BreakpointLocation::GetDescription(Stream *s, if (sc.line_entry.line > 0) { s->EOL(); s->Indent("location = "); - sc.line_entry.DumpStopContext(s, true); + if (auto preferred = GetPreferredLineEntry()) + preferred->DumpStopContext(s, true); + else + sc.line_entry.DumpStopContext(s, true); } } else { @@ -656,6 +671,50 @@ void BreakpointLocation::SendBreakpointLocationChangedEvent( } } +std::optional BreakpointLocation::GetSuggestedStackFrameIndex() { + auto preferred_opt = GetPreferredLineEntry(); + if (!preferred_opt) + return {}; + LineEntry preferred = *preferred_opt; + SymbolContext sc; + if (!m_address.CalculateSymbolContext(&sc)) + return {}; + // Don't return anything special if frame 0 is the preferred line entry. + // We not really telling the stack frame list to do anything special in that + // case. 
+ if (!LineEntry::Compare(sc.line_entry, preferred)) + return {}; + + if (!sc.block) + return {}; + + // Blocks have their line info in Declaration form, so make one here: + Declaration preferred_decl(preferred.GetFile(), preferred.line, + preferred.column); + + uint32_t depth = 0; + Block *inlined_block = sc.block->GetContainingInlinedBlock(); + while (inlined_block) { + // If we've moved to a block that this isn't the start of, that's not + // our inlining info or call site, so we can stop here. + Address start_address; + if (!inlined_block->GetStartAddress(start_address) || + start_address != m_address) + return {}; + + const InlineFunctionInfo *info = inlined_block->GetInlinedFunctionInfo(); + if (info) { + if (preferred_decl == info->GetDeclaration()) + return depth; + if (preferred_decl == info->GetCallSite()) + return depth + 1; + } + inlined_block = inlined_block->GetInlinedParent(); + depth++; + } + return {}; +} + void BreakpointLocation::SwapLocation(BreakpointLocationSP swap_from) { m_address = swap_from->m_address; m_should_resolve_indirect_functions = diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp index 8307689c7640c..9643602d78c75 100644 --- a/lldb/source/Breakpoint/BreakpointResolver.cpp +++ b/lldb/source/Breakpoint/BreakpointResolver.cpp @@ -340,6 +340,21 @@ void BreakpointResolver::AddLocation(SearchFilter &filter, } BreakpointLocationSP bp_loc_sp(AddLocation(line_start)); + // If the address that we resolved the location to returns a different + // LineEntry from the one in the incoming SC, we're probably dealing with an + // inlined call site, so set that as the preferred LineEntry: + LineEntry resolved_entry; + if (!skipped_prologue && bp_loc_sp && + line_start.CalculateSymbolContextLineEntry(resolved_entry) && + LineEntry::Compare(resolved_entry, sc.line_entry)) { + // FIXME: The function name will also be wrong here. 
Do we need to record + // that as well, or can we figure that out again when we report this + // breakpoint location. + if (!bp_loc_sp->SetPreferredLineEntry(sc.line_entry)) { + LLDB_LOG(log, "Tried to add a preferred line entry that didn't have the " + "same address as this location's address."); + } + } if (log && bp_loc_sp && !GetBreakpoint()->IsInternal()) { StreamString s; bp_loc_sp->GetDescription(&s, lldb::eDescriptionLevelVerbose); diff --git a/lldb/source/Breakpoint/BreakpointSite.cpp b/lldb/source/Breakpoint/BreakpointSite.cpp index 3ca93f908e30b..9700a57d3346e 100644 --- a/lldb/source/Breakpoint/BreakpointSite.cpp +++ b/lldb/source/Breakpoint/BreakpointSite.cpp @@ -87,6 +87,23 @@ void BreakpointSite::GetDescription(Stream *s, lldb::DescriptionLevel level) { m_constituents.GetDescription(s, level); } +std::optional BreakpointSite::GetSuggestedStackFrameIndex() { + + std::optional result; + std::lock_guard guard(m_constituents_mutex); + for (BreakpointLocationSP loc_sp : m_constituents.BreakpointLocations()) { + std::optional loc_frame_index = + loc_sp->GetSuggestedStackFrameIndex(); + if (loc_frame_index) { + if (result) + result = std::max(*loc_frame_index, *result); + else + result = loc_frame_index; + } + } + return result; +} + bool BreakpointSite::IsInternal() const { return m_constituents.IsInternal(); } uint8_t *BreakpointSite::GetTrapOpcodeBytes() { return &m_trap_opcode[0]; } diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp index f9786529bcdb1..e4c6e374446e8 100644 --- a/lldb/source/Commands/CommandObjectType.cpp +++ b/lldb/source/Commands/CommandObjectType.cpp @@ -2649,6 +2649,8 @@ class CommandObjectTypeLookup : public CommandObjectRaw { return false; LanguageType lt1 = lang1->GetLanguageType(); LanguageType lt2 = lang2->GetLanguageType(); + if (lt1 == lt2) + return false; if (lt1 == guessed_language) return true; // make the selected frame's language come first if (lt2 == guessed_language) 
diff --git a/lldb/source/Core/Declaration.cpp b/lldb/source/Core/Declaration.cpp index 579a3999d14ea..a485c4b9ba48a 100644 --- a/lldb/source/Core/Declaration.cpp +++ b/lldb/source/Core/Declaration.cpp @@ -70,8 +70,9 @@ int Declaration::Compare(const Declaration &a, const Declaration &b) { return 0; } -bool Declaration::FileAndLineEqual(const Declaration &declaration) const { - int file_compare = FileSpec::Compare(this->m_file, declaration.m_file, true); +bool Declaration::FileAndLineEqual(const Declaration &declaration, + bool full) const { + int file_compare = FileSpec::Compare(this->m_file, declaration.m_file, full); return file_compare == 0 && this->m_line == declaration.m_line; } diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 60117cb5f0e61..f95f854c5f220 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -10,9 +10,8 @@ #include #include -#include "lldb/Host/Editline.h" - #include "lldb/Host/ConnectionFileDescriptor.h" +#include "lldb/Host/Editline.h" #include "lldb/Host/FileSystem.h" #include "lldb/Host/Host.h" #include "lldb/Utility/CompletionRequest.h" @@ -23,6 +22,7 @@ #include "lldb/Utility/StreamString.h" #include "lldb/Utility/StringList.h" #include "lldb/Utility/Timeout.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Locale.h" @@ -444,7 +444,9 @@ StringList Editline::GetInputAsStringList(int line_count) { if (line_count == 0) break; #if LLDB_EDITLINE_USE_WCHAR - lines.AppendString(m_utf8conv.to_bytes(line)); + std::string buffer; + llvm::convertWideToUTF8(line, buffer); + lines.AppendString(buffer); #else lines.AppendString(line); #endif @@ -636,7 +638,9 @@ unsigned char Editline::BreakLineCommand(int ch) { if (m_fix_indentation_callback) { StringList lines = GetInputAsStringList(m_current_line_index + 1); #if LLDB_EDITLINE_USE_WCHAR - lines.AppendString(m_utf8conv.to_bytes(new_line_fragment)); + std::string 
buffer; + llvm::convertWideToUTF8(new_line_fragment, buffer); + lines.AppendString(buffer); #else lines.AppendString(new_line_fragment); #endif @@ -684,8 +688,9 @@ unsigned char Editline::EndOrAddLineCommand(int ch) { m_input_lines.clear(); for (unsigned index = 0; index < lines.GetSize(); index++) { #if LLDB_EDITLINE_USE_WCHAR - m_input_lines.insert(m_input_lines.end(), - m_utf8conv.from_bytes(lines[index])); + std::wstring wbuffer; + llvm::ConvertUTF8toWide(lines[index], wbuffer); + m_input_lines.insert(m_input_lines.end(), wbuffer); #else m_input_lines.insert(m_input_lines.end(), lines[index]); #endif @@ -869,7 +874,9 @@ unsigned char Editline::FixIndentationCommand(int ch) { currentLine = currentLine.erase(0, -indent_correction); } #if LLDB_EDITLINE_USE_WCHAR - m_input_lines[m_current_line_index] = m_utf8conv.from_bytes(currentLine); + std::wstring wbuffer; + llvm::ConvertUTF8toWide(currentLine, wbuffer); + m_input_lines[m_current_line_index] = wbuffer; #else m_input_lines[m_current_line_index] = currentLine; #endif @@ -1502,7 +1509,7 @@ bool Editline::GetLine(std::string &line, bool &interrupted) { } else { m_history_sp->Enter(input); #if LLDB_EDITLINE_USE_WCHAR - line = m_utf8conv.to_bytes(SplitLines(input)[0]); + llvm::convertWideToUTF8(SplitLines(input)[0], line); #else line = SplitLines(input)[0]; #endif @@ -1574,25 +1581,22 @@ bool Editline::CompleteCharacter(char ch, EditLineGetCharType &out) { out = (unsigned char)ch; return true; #else - LLDB_DEPRECATED_WARNING_DISABLE - std::codecvt_utf8 cvt; - LLDB_DEPRECATED_WARNING_RESTORE llvm::SmallString<4> input; for (;;) { - const char *from_next; - wchar_t *to_next; - std::mbstate_t state = std::mbstate_t(); input.push_back(ch); - switch (cvt.in(state, input.begin(), input.end(), from_next, &out, &out + 1, - to_next)) { - case std::codecvt_base::ok: + auto *cur_ptr = reinterpret_cast(input.begin()); + auto *end_ptr = reinterpret_cast(input.end()); + llvm::UTF32 code_point = 0; + llvm::ConversionResult cr = 
llvm::convertUTF8Sequence( + &cur_ptr, end_ptr, &code_point, llvm::lenientConversion); + switch (cr) { + case llvm::conversionOK: + out = code_point; return out != (EditLineGetCharType)WEOF; - - case std::codecvt_base::error: - case std::codecvt_base::noconv: + case llvm::targetExhausted: + case llvm::sourceIllegal: return false; - - case std::codecvt_base::partial: + case llvm::sourceExhausted: lldb::ConnectionStatus status; size_t read_count = m_input_connection.Read( &ch, 1, std::chrono::seconds(0), status, nullptr); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp index 90ccd1055199a..a0f8cf954f804 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/PythonDataObjects.cpp @@ -71,12 +71,12 @@ Expected python::As(Expected &&obj) { } static bool python_is_finalizing() { -#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 13) || (PY_MAJOR_VERSION > 3) +#if PY_VERSION_HEX >= 0x030d0000 return Py_IsFinalizing(); -#elif PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 7 - return _Py_Finalizing != nullptr; -#else +#elif PY_VERSION_HEX >= 0x03070000 return _Py_IsFinalizing(); +#else + return _Py_Finalizing != nullptr; #endif } @@ -810,7 +810,7 @@ bool PythonCallable::Check(PyObject *py_obj) { return PyCallable_Check(py_obj); } -#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3 +#if PY_VERSION_HEX >= 0x03030000 static const char get_arg_info_script[] = R"( from inspect import signature, Parameter, ismethod from collections import namedtuple @@ -839,7 +839,7 @@ Expected PythonCallable::GetArgInfo() const { if (!IsValid()) return nullDeref(); -#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3 +#if PY_VERSION_HEX >= 0x03030000 // no need to synchronize access to this global, we already have the GIL static PythonScript get_arg_info(get_arg_info_script); diff --git 
a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 7cc38da6a6a94..ef3c53ca5698d 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -71,8 +71,7 @@ extern "C" PyObject *PyInit__lldb(void); #define LLDB_USE_PYTHON_SET_INTERRUPT 0 #else // PyErr_SetInterrupt was introduced in 3.2. -#define LLDB_USE_PYTHON_SET_INTERRUPT \ - (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 2) || (PY_MAJOR_VERSION > 3) +#define LLDB_USE_PYTHON_SET_INTERRUPT PY_VERSION_HEX >= 0x03020000 #endif static ScriptInterpreterPythonImpl *GetPythonInterpreter(Debugger &debugger) { @@ -92,7 +91,33 @@ namespace { struct InitializePythonRAII { public: InitializePythonRAII() { - InitializePythonHome(); +#if PY_VERSION_HEX >= 0x03080000 + PyConfig config; + PyConfig_InitPythonConfig(&config); +#endif + +#if LLDB_EMBED_PYTHON_HOME + static std::string g_python_home = []() -> std::string { + if (llvm::sys::path::is_absolute(LLDB_PYTHON_HOME)) + return LLDB_PYTHON_HOME; + + FileSpec spec = HostInfo::GetShlibDir(); + if (!spec) + return {}; + spec.AppendPathComponent(LLDB_PYTHON_HOME); + return spec.GetPath(); + }(); + if (!g_python_home.empty()) { +#if PY_VERSION_HEX >= 0x03080000 + PyConfig_SetBytesString(&config, &config.home, g_python_home.c_str()); +#else + size_t size = 0; + wchar_t *python_home_w = Py_DecodeLocale(g_python_home.c_str(), &size); + Py_SetPythonHome(python_home_w); + PyMem_RawFree(python_home_w); +#endif + } +#endif // The table of built-in modules can only be extended before Python is // initialized. 
@@ -117,15 +142,22 @@ struct InitializePythonRAII { PyImport_AppendInittab("_lldb", LLDBSwigPyInit); } +#if PY_VERSION_HEX >= 0x03080000 + config.install_signal_handlers = 0; + Py_InitializeFromConfig(&config); + PyConfig_Clear(&config); + InitializeThreadsPrivate(); +#else // Python < 3.2 and Python >= 3.2 reversed the ordering requirements for // calling `Py_Initialize` and `PyEval_InitThreads`. < 3.2 requires that you // call `PyEval_InitThreads` first, and >= 3.2 requires that you call it last. -#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 2) || (PY_MAJOR_VERSION > 3) +#if PY_VERSION_HEX >= 0x03020000 Py_InitializeEx(0); InitializeThreadsPrivate(); #else InitializeThreadsPrivate(); Py_InitializeEx(0); +#endif #endif } @@ -142,32 +174,6 @@ struct InitializePythonRAII { } private: - void InitializePythonHome() { -#if LLDB_EMBED_PYTHON_HOME - typedef wchar_t *str_type; - static str_type g_python_home = []() -> str_type { - const char *lldb_python_home = LLDB_PYTHON_HOME; - const char *absolute_python_home = nullptr; - llvm::SmallString<64> path; - if (llvm::sys::path::is_absolute(lldb_python_home)) { - absolute_python_home = lldb_python_home; - } else { - FileSpec spec = HostInfo::GetShlibDir(); - if (!spec) - return nullptr; - spec.GetPath(path); - llvm::sys::path::append(path, lldb_python_home); - absolute_python_home = path.c_str(); - } - size_t size = 0; - return Py_DecodeLocale(absolute_python_home, &size); - }(); - if (g_python_home != nullptr) { - Py_SetPythonHome(g_python_home); - } -#endif - } - void InitializeThreadsPrivate() { // Since Python 3.7 `Py_Initialize` calls `PyEval_InitThreads` inside itself, // so there is no way to determine whether the embedded interpreter @@ -175,7 +181,7 @@ struct InitializePythonRAII { // would always return `true` and `PyGILState_Ensure/Release` flow would be // executed instead of unlocking GIL with `PyEval_SaveThread`. When // an another thread calls `PyGILState_Ensure` it would get stuck in deadlock. 
-#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 7) || (PY_MAJOR_VERSION > 3) +#if PY_VERSION_HEX >= 0x03070000 // The only case we should go further and acquire the GIL: it is unlocked. if (PyGILState_Check()) return; @@ -183,7 +189,7 @@ struct InitializePythonRAII { // `PyEval_ThreadsInitialized` was deprecated in Python 3.9 and removed in // Python 3.13. It has been returning `true` always since Python 3.7. -#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 9) || (PY_MAJOR_VERSION < 3) +#if PY_VERSION_HEX < 0x03090000 if (PyEval_ThreadsInitialized()) { #else if (true) { @@ -197,7 +203,7 @@ struct InitializePythonRAII { // `PyEval_InitThreads` was deprecated in Python 3.9 and removed in // Python 3.13. -#if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 9) || (PY_MAJOR_VERSION < 3) +#if PY_VERSION_HEX < 0x03090000 return; } @@ -446,8 +452,9 @@ ScriptInterpreterPythonImpl::ScriptInterpreterPythonImpl(Debugger &debugger) // Reloading modules requires a different syntax in Python 2 and Python 3. // This provides a consistent syntax no matter what version of Python. 
run_string.Clear(); - run_string.Printf("run_one_line (%s, 'from importlib import reload as reload_module')", - m_dictionary_name.c_str()); + run_string.Printf( + "run_one_line (%s, 'from importlib import reload as reload_module')", + m_dictionary_name.c_str()); PyRun_SimpleString(run_string.GetData()); // WARNING: temporary code that loads Cocoa formatters - this should be done @@ -763,21 +770,19 @@ llvm::Expected ScriptInterpreterPythonImpl::GetMaxPositionalArgumentsForCallable( const llvm::StringRef &callable_name) { if (callable_name.empty()) { - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "called with empty callable name."); - } - Locker py_lock(this, Locker::AcquireLock | - Locker::InitSession | - Locker::NoSTDIN); - auto dict = PythonModule::MainModule() - .ResolveName(m_dictionary_name); + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "called with empty callable name."); + } + Locker py_lock(this, + Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); + auto dict = PythonModule::MainModule().ResolveName( + m_dictionary_name); auto pfunc = PythonObject::ResolveNameWithDictionary( callable_name, dict); if (!pfunc.IsAllocated()) { - return llvm::createStringError( - llvm::inconvertibleErrorCode(), - "can't find callable: %s", callable_name.str().c_str()); + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "can't find callable: %s", + callable_name.str().c_str()); } llvm::Expected arg_info = pfunc.GetArgInfo(); if (!arg_info) @@ -1259,8 +1264,7 @@ Status ScriptInterpreterPythonImpl::SetBreakpointCommandCallback( // Set a Python one-liner as the callback for the watchpoint. 
void ScriptInterpreterPythonImpl::SetWatchpointCommandCallback( - WatchpointOptions *wp_options, const char *user_input, - bool is_callback) { + WatchpointOptions *wp_options, const char *user_input, bool is_callback) { auto data_up = std::make_unique(); // It's necessary to set both user_source and script_source to the oneliner. @@ -1286,8 +1290,7 @@ Status ScriptInterpreterPythonImpl::ExportFunctionDefinitionToInterpreter( std::string function_def_string(function_def.CopyList()); Status error = ExecuteMultipleLines( - function_def_string.c_str(), - ExecuteScriptOptions().SetEnableIO(false)); + function_def_string.c_str(), ExecuteScriptOptions().SetEnableIO(false)); return error; } @@ -2068,7 +2071,8 @@ int ScriptInterpreterPythonImpl::GetIndexOfChildWithName( { Locker py_lock(this, Locker::AcquireLock | Locker::InitSession | Locker::NoSTDIN); - ret_val = SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(implementor, child_name); + ret_val = SWIGBridge::LLDBSwigPython_GetIndexOfChildWithName(implementor, + child_name); } return ret_val; @@ -2460,7 +2464,8 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( // the lifetime of the process in which this LLDB framework is living. 
const bool does_contain_executed = ExecuteOneLineWithReturn( command_stream.GetData(), - ScriptInterpreterPythonImpl::eScriptReturnTypeBool, &does_contain, exc_options); + ScriptInterpreterPythonImpl::eScriptReturnTypeBool, &does_contain, + exc_options); const bool was_imported_globally = does_contain_executed && does_contain; const bool was_imported_locally = @@ -2677,7 +2682,7 @@ bool ScriptInterpreterPythonImpl::RunScriptBasedParsedCommand( args_arr_sp->AddStringItem(entry.ref()); } StructuredDataImpl args_impl(args_arr_sp); - + ret_val = SWIGBridge::LLDBSwigPythonCallParsedCommandObject( static_cast(impl_obj_sp->GetValue()), debugger_sp, args_impl, cmd_retobj, exe_ctx_ref_sp); @@ -2779,8 +2784,7 @@ bool ScriptInterpreterPythonImpl::GetDocumentationForItem(const char *item, if (ExecuteOneLineWithReturn( command, ScriptInterpreter::eScriptReturnTypeCharStrOrNone, - &result_ptr, - ExecuteScriptOptions().SetEnableIO(false))) { + &result_ptr, ExecuteScriptOptions().SetEnableIO(false))) { if (result_ptr) dest.assign(result_ptr); return true; @@ -2878,7 +2882,7 @@ uint32_t ScriptInterpreterPythonImpl::GetFlagsForCommandObject( return result; } -StructuredData::ObjectSP +StructuredData::ObjectSP ScriptInterpreterPythonImpl::GetOptionsForCommandObject( StructuredData::GenericSP cmd_obj_sp) { StructuredData::ObjectSP result = {}; @@ -2923,10 +2927,10 @@ ScriptInterpreterPythonImpl::GetOptionsForCommandObject( PyErr_Clear(); return {}; } - return py_return.CreateStructuredObject(); + return py_return.CreateStructuredObject(); } -StructuredData::ObjectSP +StructuredData::ObjectSP ScriptInterpreterPythonImpl::GetArgumentsForCommandObject( StructuredData::GenericSP cmd_obj_sp) { StructuredData::ObjectSP result = {}; @@ -2971,11 +2975,10 @@ ScriptInterpreterPythonImpl::GetArgumentsForCommandObject( PyErr_Clear(); return {}; } - return py_return.CreateStructuredObject(); + return py_return.CreateStructuredObject(); } -void 
-ScriptInterpreterPythonImpl::OptionParsingStartedForCommandObject( +void ScriptInterpreterPythonImpl::OptionParsingStartedForCommandObject( StructuredData::GenericSP cmd_obj_sp) { Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, Locker::FreeLock); @@ -2983,7 +2986,7 @@ ScriptInterpreterPythonImpl::OptionParsingStartedForCommandObject( static char callee_name[] = "option_parsing_started"; if (!cmd_obj_sp) - return ; + return; PythonObject implementor(PyRefType::Borrowed, (PyObject *)cmd_obj_sp->GetValue()); @@ -3009,10 +3012,9 @@ ScriptInterpreterPythonImpl::OptionParsingStartedForCommandObject( if (PyErr_Occurred()) PyErr_Clear(); - // option_parsing_starting doesn't return anything, ignore anything but + // option_parsing_starting doesn't return anything, ignore anything but // python errors. - unwrapOrSetPythonException( - As(implementor.CallMethod(callee_name))); + unwrapOrSetPythonException(As(implementor.CallMethod(callee_name))); // if it fails, print the error but otherwise go on if (PyErr_Occurred()) { @@ -3022,8 +3024,7 @@ ScriptInterpreterPythonImpl::OptionParsingStartedForCommandObject( } } -bool -ScriptInterpreterPythonImpl::SetOptionValueForCommandObject( +bool ScriptInterpreterPythonImpl::SetOptionValueForCommandObject( StructuredData::GenericSP cmd_obj_sp, ExecutionContext *exe_ctx, llvm::StringRef long_option, llvm::StringRef value) { StructuredData::ObjectSP result = {}; @@ -3058,15 +3059,15 @@ ScriptInterpreterPythonImpl::SetOptionValueForCommandObject( if (PyErr_Occurred()) PyErr_Clear(); - + lldb::ExecutionContextRefSP exe_ctx_ref_sp; if (exe_ctx) exe_ctx_ref_sp.reset(new ExecutionContextRef(exe_ctx)); PythonObject ctx_ref_obj = SWIGBridge::ToSWIGWrapper(exe_ctx_ref_sp); - - bool py_return = unwrapOrSetPythonException( - As(implementor.CallMethod(callee_name, ctx_ref_obj, long_option.str().c_str(), - value.str().c_str()))); + + bool py_return = unwrapOrSetPythonException(As( + implementor.CallMethod(callee_name, ctx_ref_obj, + 
long_option.str().c_str(), value.str().c_str()))); // if it fails, print the error but otherwise go on if (PyErr_Occurred()) { diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/lldb-python.h b/lldb/source/Plugins/ScriptInterpreter/Python/lldb-python.h index 378b9fa2a5986..b68598b9d59d9 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/lldb-python.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/lldb-python.h @@ -47,6 +47,11 @@ static llvm::Expected *g_fcxx_modules_workaround [[maybe_unused]]; // Include python for non windows machines #include + +// Provide a meaningful diagnostic error if someone tries to compile this file +// with a version of Python we don't support. +static_assert(PY_VERSION_HEX >= 0x03000000, + "LLDB requires at least Python 3.0"); #endif #endif // LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_LLDB_PYTHON_H diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp index d83740f8e2113..4c9f1d8505f6e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp @@ -199,9 +199,9 @@ DWARFDIE::LookupDeepestBlock(lldb::addr_t address) const { return result; } -const char *DWARFDIE::GetMangledName() const { +const char *DWARFDIE::GetMangledName(bool substitute_name_allowed) const { if (IsValid()) - return m_die->GetMangledName(m_cu); + return m_die->GetMangledName(m_cu, substitute_name_allowed); else return nullptr; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h index e1318953a384c..077b78eb26d0c 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.h @@ -28,7 +28,7 @@ class DWARFDIE : public DWARFBaseDIE { // Accessors // Accessing information about a DIE - const char *GetMangledName() const; + const char *GetMangledName(bool substitute_name_allowed = true) const; const char *GetPubname() const; diff 
--git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index f23f8cc3d781d..f39189b6cead4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -2758,6 +2758,20 @@ void SymbolFileDWARF::FindTypes(const TypeQuery &query, TypeResults &results) { return true; // Keep iterating over index types, language mismatch. } + // Since mangled names are unique, we only need to check if the names are + // the same. + if (query.GetSearchByMangledName()) { + if (die.GetMangledName(/*substitute_name_allowed=*/false) != + query.GetTypeBasename().GetStringRef()) + return true; // Keep iterating over index types, mangled name mismatch. + if (Type *matching_type = ResolveType(die, true, true)) { + results.InsertUnique(matching_type->shared_from_this()); + return !results.Done(query); // Keep iterating if we aren't done. + } + return true; // Keep iterating over index types, weren't able to resolve + // this type + } + // Check the context matches std::vector die_context; if (query.GetModuleSearch()) diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp index f7d9c0d2d3306..5c7772a6db780 100644 --- a/lldb/source/Symbol/Block.cpp +++ b/lldb/source/Symbol/Block.cpp @@ -230,7 +230,7 @@ Block *Block::GetContainingInlinedBlockWithCallSite( const auto *function_info = inlined_block->GetInlinedFunctionInfo(); if (function_info && - function_info->GetCallSite().FileAndLineEqual(find_call_site)) + function_info->GetCallSite().FileAndLineEqual(find_call_site, true)) return inlined_block; inlined_block = inlined_block->GetInlinedParent(); } diff --git a/lldb/source/Symbol/CompileUnit.cpp b/lldb/source/Symbol/CompileUnit.cpp index db8f8ce6bcbc9..73389b2e8479b 100644 --- a/lldb/source/Symbol/CompileUnit.cpp +++ b/lldb/source/Symbol/CompileUnit.cpp @@ -251,7 +251,10 @@ void CompileUnit::ResolveSymbolContext( SymbolContextItem 
resolve_scope, SymbolContextList &sc_list, RealpathPrefixes *realpath_prefixes) { const FileSpec file_spec = src_location_spec.GetFileSpec(); - const uint32_t line = src_location_spec.GetLine().value_or(0); + const uint32_t line = + src_location_spec.GetLine().value_or(LLDB_INVALID_LINE_NUMBER); + const uint32_t column_num = + src_location_spec.GetColumn().value_or(LLDB_INVALID_COLUMN_NUMBER); const bool check_inlines = src_location_spec.GetCheckInlines(); // First find all of the file indexes that match our "file_spec". If @@ -268,7 +271,7 @@ void CompileUnit::ResolveSymbolContext( SymbolContext sc(GetModule()); sc.comp_unit = this; - if (line == 0) { + if (line == LLDB_INVALID_LINE_NUMBER) { if (file_spec_matches_cu_file_spec && !check_inlines) { // only append the context if we aren't looking for inline call sites by // file and line and if the file spec matches that of the compile unit @@ -312,6 +315,112 @@ void CompileUnit::ResolveSymbolContext( 0, file_indexes, src_location_spec, &line_entry); } + // If we didn't manage to find a breakpoint that matched the line number + // requested, that might be because it is only an inline call site, and + // doesn't have a line entry in the line table. Scan for that here. + // + // We are making the assumption that if there was an inlined function it will + // contribute at least 1 non-call-site entry to the line table. That's handy + // because we don't move line breakpoints over function boundaries, so if we + // found a hit, and there were also a call site entry, it would have to be in + // the function containing the PC of the line table match. That way we can + // limit the call site search to that function. + // We will miss functions that ONLY exist as a call site entry. 
+ + if (line_entry.IsValid() && + (line_entry.line != line || line_entry.column != column_num) && + resolve_scope & eSymbolContextLineEntry && check_inlines) { + // We don't move lines over function boundaries, so the address in the + // line entry will be the in function that contained the line that might + // be a CallSite, and we can just iterate over that function to find any + // inline records, and dig up their call sites. + Address start_addr = line_entry.range.GetBaseAddress(); + Function *function = start_addr.CalculateSymbolContextFunction(); + + Declaration sought_decl(file_spec, line, column_num); + // We use this recursive function to descend the block structure looking + // for a block that has this Declaration as in it's CallSite info. + // This function recursively scans the sibling blocks of the incoming + // block parameter. + std::function examine_block = + [&sought_decl, &sc_list, &src_location_spec, resolve_scope, + &examine_block](Block &block) -> void { + // Iterate over the sibling child blocks of the incoming block. + Block *sibling_block = block.GetFirstChild(); + while (sibling_block) { + // We only have to descend through the regular blocks, looking for + // immediate inlines, since those are the only ones that will have this + // callsite. + const InlineFunctionInfo *inline_info = + sibling_block->GetInlinedFunctionInfo(); + if (inline_info) { + // If this is the call-site we are looking for, record that: + // We need to be careful because the call site from the debug info + // will generally have a column, but the user might not have specified + // it. + Declaration found_decl = inline_info->GetCallSite(); + uint32_t sought_column = sought_decl.GetColumn(); + if (found_decl.FileAndLineEqual(sought_decl, false) && + (sought_column == LLDB_INVALID_COLUMN_NUMBER || + sought_column == found_decl.GetColumn())) { + // If we found a call site, it belongs not in this inlined block, + // but in the parent block that inlined it. 
+ Address parent_start_addr; + if (sibling_block->GetParent()->GetStartAddress( + parent_start_addr)) { + SymbolContext sc; + parent_start_addr.CalculateSymbolContext(&sc, resolve_scope); + // Now swap out the line entry for the one we found. + LineEntry call_site_line = sc.line_entry; + call_site_line.line = found_decl.GetLine(); + call_site_line.column = found_decl.GetColumn(); + bool matches_spec = true; + // If the user asked for an exact match, we need to make sure the + // call site we found actually matches the location. + if (src_location_spec.GetExactMatch()) { + matches_spec = false; + if ((src_location_spec.GetFileSpec() == + sc.line_entry.GetFile()) && + (src_location_spec.GetLine() && + *src_location_spec.GetLine() == call_site_line.line) && + (src_location_spec.GetColumn() && + *src_location_spec.GetColumn() == call_site_line.column)) + matches_spec = true; + } + if (matches_spec && + sibling_block->GetRangeAtIndex(0, call_site_line.range)) { + SymbolContext call_site_sc(sc.target_sp, sc.module_sp, + sc.comp_unit, sc.function, sc.block, + &call_site_line, sc.symbol); + sc_list.Append(call_site_sc); + } + } + } + } + + // Descend into the child blocks: + examine_block(*sibling_block); + // Now go to the next sibling: + sibling_block = sibling_block->GetSibling(); + } + }; + + if (function) { + // We don't need to examine the function block, it can't be inlined. + Block &func_block = function->GetBlock(true); + examine_block(func_block); + } + // If we found entries here, we are done. We only get here because we + // didn't find an exact line entry for this line & column, but if we found + // an exact match from the call site info that's strictly better than + // continuing to look for matches further on in the file. + // FIXME: Should I also do this for "call site line exists between the + // given line number and the later line we found in the line table"? That's + // a closer approximation to our general sliding algorithm. 
+ if (sc_list.GetSize()) + return; + } + // If "exact == true", then "found_line" will be the same as "line". If // "exact == false", the "found_line" will be the closest line entry // with a line number greater than "line" and we will use this for our diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index 3849ec5ed178d..94a381edd5e20 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -85,121 +85,32 @@ void StackFrameList::ResetCurrentInlinedDepth() { return; std::lock_guard guard(m_mutex); - - GetFramesUpTo(0, DoNotAllowInterruption); - if (m_frames.empty()) - return; - if (!m_frames[0]->IsInlined()) { - m_current_inlined_depth = UINT32_MAX; - m_current_inlined_pc = LLDB_INVALID_ADDRESS; - Log *log = GetLog(LLDBLog::Step); - if (log && log->GetVerbose()) - LLDB_LOGF( - log, - "ResetCurrentInlinedDepth: Invalidating current inlined depth.\n"); - return; - } - // We only need to do something special about inlined blocks when we are - // at the beginning of an inlined function: - // FIXME: We probably also have to do something special if the PC is at - // the END of an inlined function, which coincides with the end of either - // its containing function or another inlined function. - - Block *block_ptr = m_frames[0]->GetFrameBlock(); - if (!block_ptr) - return; + m_current_inlined_pc = LLDB_INVALID_ADDRESS; + m_current_inlined_depth = UINT32_MAX; - Address pc_as_address; - lldb::addr_t curr_pc = m_thread.GetRegisterContext()->GetPC(); - pc_as_address.SetLoadAddress(curr_pc, &(m_thread.GetProcess()->GetTarget())); - AddressRange containing_range; - if (!block_ptr->GetRangeContainingAddress(pc_as_address, containing_range) || - pc_as_address != containing_range.GetBaseAddress()) - return; - - // If we got here because of a breakpoint hit, then set the inlined depth - // depending on where the breakpoint was set. 
If we got here because of a - // crash, then set the inlined depth to the deepest most block. Otherwise, - // we stopped here naturally as the result of a step, so set ourselves in the - // containing frame of the whole set of nested inlines, so the user can then - // "virtually" step into the frames one by one, or next over the whole mess. - // Note: We don't have to handle being somewhere in the middle of the stack - // here, since ResetCurrentInlinedDepth doesn't get called if there is a - // valid inlined depth set. StopInfoSP stop_info_sp = m_thread.GetStopInfo(); if (!stop_info_sp) return; - switch (stop_info_sp->GetStopReason()) { - case eStopReasonWatchpoint: - case eStopReasonException: - case eStopReasonExec: - case eStopReasonFork: - case eStopReasonVFork: - case eStopReasonVForkDone: - case eStopReasonSignal: - // In all these cases we want to stop in the deepest frame. - m_current_inlined_pc = curr_pc; - m_current_inlined_depth = 0; - break; - case eStopReasonBreakpoint: { - // FIXME: Figure out what this break point is doing, and set the inline - // depth appropriately. Be careful to take into account breakpoints that - // implement step over prologue, since that should do the default - // calculation. For now, if the breakpoints corresponding to this hit are - // all internal, I set the stop location to the top of the inlined stack, - // since that will make things like stepping over prologues work right. - // But if there are any non-internal breakpoints I do to the bottom of the - // stack, since that was the old behavior. 
- uint32_t bp_site_id = stop_info_sp->GetValue(); - BreakpointSiteSP bp_site_sp( - m_thread.GetProcess()->GetBreakpointSiteList().FindByID(bp_site_id)); - bool all_internal = true; - if (bp_site_sp) { - uint32_t num_owners = bp_site_sp->GetNumberOfConstituents(); - for (uint32_t i = 0; i < num_owners; i++) { - Breakpoint &bp_ref = - bp_site_sp->GetConstituentAtIndex(i)->GetBreakpoint(); - if (!bp_ref.IsInternal()) { - all_internal = false; - } - } - } - if (!all_internal) { - m_current_inlined_pc = curr_pc; - m_current_inlined_depth = 0; - break; - } - } - [[fallthrough]]; - default: { - // Otherwise, we should set ourselves at the container of the inlining, so - // that the user can descend into them. So first we check whether we have - // more than one inlined block sharing this PC: - int num_inlined_functions = 0; - - for (Block *container_ptr = block_ptr->GetInlinedParent(); - container_ptr != nullptr; - container_ptr = container_ptr->GetInlinedParent()) { - if (!container_ptr->GetRangeContainingAddress(pc_as_address, - containing_range)) - break; - if (pc_as_address != containing_range.GetBaseAddress()) - break; - num_inlined_functions++; - } - m_current_inlined_pc = curr_pc; - m_current_inlined_depth = num_inlined_functions + 1; - Log *log = GetLog(LLDBLog::Step); + bool inlined = true; + auto inline_depth = stop_info_sp->GetSuggestedStackFrameIndex(inlined); + // We're only adjusting the inlined stack here. 
+ Log *log = GetLog(LLDBLog::Step); + if (inline_depth) { + m_current_inlined_depth = *inline_depth; + m_current_inlined_pc = m_thread.GetRegisterContext()->GetPC(); + if (log && log->GetVerbose()) LLDB_LOGF(log, "ResetCurrentInlinedDepth: setting inlined " "depth: %d 0x%" PRIx64 ".\n", - m_current_inlined_depth, curr_pc); - - break; - } + m_current_inlined_depth, m_current_inlined_pc); + } else { + if (log && log->GetVerbose()) + LLDB_LOGF( + log, + "ResetCurrentInlinedDepth: Invalidating current inlined depth.\n"); } } @@ -816,19 +727,48 @@ void StackFrameList::SelectMostRelevantFrame() { RecognizedStackFrameSP recognized_frame_sp = frame_sp->GetRecognizedFrame(); - if (!recognized_frame_sp) { - LLDB_LOG(log, "Frame #0 not recognized"); - return; + if (recognized_frame_sp) { + if (StackFrameSP most_relevant_frame_sp = + recognized_frame_sp->GetMostRelevantFrame()) { + LLDB_LOG(log, "Found most relevant frame at index {0}", + most_relevant_frame_sp->GetFrameIndex()); + SetSelectedFrame(most_relevant_frame_sp.get()); + return; + } } + LLDB_LOG(log, "Frame #0 not recognized"); - if (StackFrameSP most_relevant_frame_sp = - recognized_frame_sp->GetMostRelevantFrame()) { - LLDB_LOG(log, "Found most relevant frame at index {0}", - most_relevant_frame_sp->GetFrameIndex()); - SetSelectedFrame(most_relevant_frame_sp.get()); - } else { - LLDB_LOG(log, "No relevant frame!"); + // If this thread has a non-trivial StopInof, then let it suggest + // a most relevant frame: + StopInfoSP stop_info_sp = m_thread.GetStopInfo(); + uint32_t stack_idx = 0; + bool found_relevant = false; + if (stop_info_sp) { + // Here we're only asking the stop info if it wants to adjust the real stack + // index. We have to ask about the m_inlined_stack_depth in + // Thread::ShouldStop since the plans need to reason with that info. 
+ bool inlined = false; + std::optional stack_opt = + stop_info_sp->GetSuggestedStackFrameIndex(inlined); + if (stack_opt) { + stack_idx = *stack_opt; + found_relevant = true; + } } + + frame_sp = GetFrameAtIndex(stack_idx); + if (!frame_sp) + LLDB_LOG(log, "Stop info suggested relevant frame {0} but it didn't exist", + stack_idx); + else if (found_relevant) + LLDB_LOG(log, "Setting selected frame from stop info to {0}", stack_idx); + // Note, we don't have to worry about "inlined" frames here, because we've + // already calculated the inlined frame in Thread::ShouldStop, and + // SetSelectedFrame will take care of that adjustment for us. + SetSelectedFrame(frame_sp.get()); + + if (!found_relevant) + LLDB_LOG(log, "No relevant frame!"); } uint32_t StackFrameList::GetSelectedFrameIndex( @@ -841,6 +781,7 @@ uint32_t StackFrameList::GetSelectedFrameIndex( // isn't set, then don't force a selection here, just return 0. if (!select_most_relevant) return 0; + // If the inlined stack frame is set, then use that: m_selected_frame_idx = 0; } return *m_selected_frame_idx; diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 60aa65ed38c74..f6387d47504e6 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -15,6 +15,7 @@ #include "lldb/Breakpoint/WatchpointResource.h" #include "lldb/Core/Debugger.h" #include "lldb/Expression/UserExpression.h" +#include "lldb/Symbol/Block.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" #include "lldb/Target/Target.h" @@ -246,6 +247,22 @@ class StopInfoBreakpoint : public StopInfo { return m_description.c_str(); } + std::optional + GetSuggestedStackFrameIndex(bool inlined_stack) override { + if (!inlined_stack) + return {}; + + ThreadSP thread_sp(m_thread_wp.lock()); + if (!thread_sp) + return {}; + BreakpointSiteSP bp_site_sp( + thread_sp->GetProcess()->GetBreakpointSiteList().FindByID(m_value)); + if (!bp_site_sp) + return {}; + + return 
bp_site_sp->GetSuggestedStackFrameIndex(); + } + protected: bool ShouldStop(Event *event_ptr) override { // This just reports the work done by PerformAction or the synchronous @@ -1164,6 +1181,44 @@ class StopInfoTrace : public StopInfo { else return m_description.c_str(); } + + std::optional + GetSuggestedStackFrameIndex(bool inlined_stack) override { + // Trace only knows how to adjust inlined stacks: + if (!inlined_stack) + return {}; + + ThreadSP thread_sp = GetThread(); + StackFrameSP frame_0_sp = thread_sp->GetStackFrameAtIndex(0); + if (!frame_0_sp) + return {}; + if (!frame_0_sp->IsInlined()) + return {}; + Block *block_ptr = frame_0_sp->GetFrameBlock(); + if (!block_ptr) + return {}; + Address pc_address = frame_0_sp->GetFrameCodeAddress(); + AddressRange containing_range; + if (!block_ptr->GetRangeContainingAddress(pc_address, containing_range) || + pc_address != containing_range.GetBaseAddress()) + return {}; + + int num_inlined_functions = 0; + + for (Block *container_ptr = block_ptr->GetInlinedParent(); + container_ptr != nullptr; + container_ptr = container_ptr->GetInlinedParent()) { + if (!container_ptr->GetRangeContainingAddress(pc_address, + containing_range)) + break; + if (pc_address != containing_range.GetBaseAddress()) + break; + + num_inlined_functions++; + } + inlined_stack = true; + return num_inlined_functions + 1; + } }; // StopInfoException diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 8373cdc36268f..735295e6f2593 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -619,6 +619,14 @@ void Thread::WillStop() { void Thread::SetupForResume() { if (GetResumeState() != eStateSuspended) { + // First check whether this thread is going to "actually" resume at all. + // For instance, if we're stepping from one level to the next of an + // virtual inlined call stack, we just change the inlined call stack index + // without actually running this thread. 
In that case, for this thread we + // shouldn't push a step over breakpoint plan or do that work. + if (GetCurrentPlan()->IsVirtualStep()) + return; + // If we're at a breakpoint push the step-over breakpoint plan. Do this // before telling the current plan it will resume, since we might change // what the current plan is. diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 567dcc26d0d37..224a17d896ccf 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -41,7 +41,7 @@ ThreadPlanStepInRange::ThreadPlanStepInRange( "Step Range stepping in", thread, range, addr_context, stop_others), ThreadPlanShouldStopHere(this), m_step_past_prologue(true), - m_virtual_step(false), m_step_into_target(step_into_target) { + m_virtual_step(eLazyBoolCalculate), m_step_into_target(step_into_target) { SetCallbacks(); SetFlagsToDefault(); SetupAvoidNoDebug(step_in_avoids_code_without_debug_info, @@ -149,7 +149,7 @@ bool ThreadPlanStepInRange::ShouldStop(Event *event_ptr) { m_sub_plan_sp.reset(); } - if (m_virtual_step) { + if (m_virtual_step == eLazyBoolYes) { // If we've just completed a virtual step, all we need to do is check for a // ShouldStopHere plan, and otherwise we're done. // FIXME - This can be both a step in and a step out. 
Probably should @@ -431,7 +431,7 @@ bool ThreadPlanStepInRange::DoPlanExplainsStop(Event *event_ptr) { bool return_value = false; - if (m_virtual_step) { + if (m_virtual_step == eLazyBoolYes) { return_value = true; } else { StopInfoSP stop_info_sp = GetPrivateStopInfo(); @@ -460,10 +460,13 @@ bool ThreadPlanStepInRange::DoPlanExplainsStop(Event *event_ptr) { bool ThreadPlanStepInRange::DoWillResume(lldb::StateType resume_state, bool current_plan) { - m_virtual_step = false; + m_virtual_step = eLazyBoolCalculate; if (resume_state == eStateStepping && current_plan) { Thread &thread = GetThread(); // See if we are about to step over a virtual inlined call. + // But if we already know we're virtual stepping, don't decrement the + // inlined depth again... + bool step_without_resume = thread.DecrementCurrentInlinedDepth(); if (step_without_resume) { Log *log = GetLog(LLDBLog::Step); @@ -476,11 +479,21 @@ bool ThreadPlanStepInRange::DoWillResume(lldb::StateType resume_state, // FIXME: Maybe it would be better to create a InlineStep stop reason, but // then // the whole rest of the world would have to handle that stop reason. 
- m_virtual_step = true; + m_virtual_step = eLazyBoolYes; } return !step_without_resume; } return true; } -bool ThreadPlanStepInRange::IsVirtualStep() { return m_virtual_step; } +bool ThreadPlanStepInRange::IsVirtualStep() { + if (m_virtual_step == eLazyBoolCalculate) { + Thread &thread = GetThread(); + uint32_t cur_inline_depth = thread.GetCurrentInlinedDepth(); + if (cur_inline_depth == UINT32_MAX || cur_inline_depth == 0) + m_virtual_step = eLazyBoolNo; + else + m_virtual_step = eLazyBoolYes; + } + return m_virtual_step == eLazyBoolYes; +} diff --git a/lldb/source/Target/ThreadPlanStepOverRange.cpp b/lldb/source/Target/ThreadPlanStepOverRange.cpp index ef5b4b5c434d1..643ee827c865c 100644 --- a/lldb/source/Target/ThreadPlanStepOverRange.cpp +++ b/lldb/source/Target/ThreadPlanStepOverRange.cpp @@ -402,7 +402,7 @@ bool ThreadPlanStepOverRange::DoWillResume(lldb::StateType resume_state, if (in_inlined_stack) { Log *log = GetLog(LLDBLog::Step); LLDB_LOGF(log, - "ThreadPlanStepInRange::DoWillResume: adjusting range to " + "ThreadPlanStepOverRange::DoWillResume: adjusting range to " "the frame at inlined depth %d.", thread.GetCurrentInlinedDepth()); StackFrameSP stack_sp = thread.GetStackFrameAtIndex(0); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py index 0919eb3c5dd81..93d5392830b50 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py @@ -40,5 +40,3 @@ def test(self): "frame variable ils", substrs=['[4] = "surprise it is a long string!! 
yay!!"'], ) - - self.expect("image list", substrs=self.getLibcPlusPlusLibs()) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py index 5eb3fc3cada92..08ac9290ee85a 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemoteClient.py @@ -132,12 +132,39 @@ def test_read_registers_using_g_packets(self): target = self.createTarget("a.yaml") process = self.connect(target) - self.assertEqual(1, self.server.responder.packetLog.count("g")) - self.server.responder.packetLog = [] + # We want to make sure that the process is using the g packet, but it's + # not required the "connect" should read all registers. However, it might + # have... So we need to wait till we explicitly 'read_registers' to do + # test. + # Also, even with the use-g-packet-for-reading lldb will sometimes send p0 + # early on to see if the packet is supported. So we can't say that there + # will be NO p packets. + # But there certainly should be no p packets after the g packet. + self.read_registers(process) - # Reading registers should not cause any 'p' packets to be exchanged. 
+ print(f"\nPACKET LOG:\n{self.server.responder.packetLog}\n") + g_pos = 0 + try: + g_pos = self.server.responder.packetLog.index("g") + except err: + self.fail("'g' packet not found after fetching registers") + + try: + second_g = self.server.responder.packetLog.index("g", g_pos) + self.fail("Found more than one 'g' packet") + except: + pass + + # Make sure there aren't any `p` packets after the `g` packet: self.assertEqual( - 0, len([p for p in self.server.responder.packetLog if p.startswith("p")]) + 0, + len( + [ + p + for p in self.server.responder.packetLog[g_pos:] + if p.startswith("p") + ] + ), ) def test_read_registers_using_p_packets(self): diff --git a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py index 752c3a9cbd286..4e2d908e63b81 100644 --- a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py +++ b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py @@ -1,6 +1,5 @@ """Test stepping over and into inlined functions.""" - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -14,6 +13,7 @@ class TestInlineStepping(TestBase): compiler="icc", bugnumber="# Not really a bug. ICC combines two inlined functions.", ) + @skipIf(oslist=["linux"], archs=["arm"]) # Fails for 32 bit arm def test_with_python_api(self): """Test stepping over and into inlined functions.""" self.build() @@ -32,6 +32,12 @@ def test_step_in_template_with_python_api(self): self.build() self.step_in_template() + @add_test_categories(["pyapi"]) + def test_virtual_inline_stepping(self): + """Test stepping through a virtual inlined call stack""" + self.build() + self.virtual_inline_stepping() + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -357,3 +363,76 @@ def step_in_template(self): step_sequence = [["// In max_value specialized", "into"]] self.run_step_sequence(step_sequence) + + def run_to_call_site_and_step( + self, source_regex, func_name, start_pos, one_more_step_loc=None + ): + main_spec = lldb.SBFileSpec("calling.cpp") + # Set the breakpoint by file and line, not sourced regex because + # we want to make sure we can set breakpoints on call sites: + call_site_line_num = line_number(self.main_source, source_regex) + target, process, thread, bkpt = lldbutil.run_to_line_breakpoint( + self, main_spec, call_site_line_num + ) + + # Make sure that the location is at the call site (run_to_line_breakpoint already asserted + # that there's one location.): + bkpt_loc = bkpt.location[0] + strm = lldb.SBStream() + result = bkpt_loc.GetDescription(strm, lldb.eDescriptionLevelFull) + + self.assertTrue(result, "Got a location description") + desc = strm.GetData() + self.assertIn(f"calling.cpp:{call_site_line_num}", desc, "Right line listed") + # We don't get the function name right yet - so we omit it in printing. + # Turn on this test when that is working. + # self.assertIn(func_name, desc, "Right function listed") + + pc = thread.frame[0].pc + for i in range(start_pos, 3): + thread.StepInto() + frame_0 = thread.frame[0] + + trivial_line_num = line_number( + self.main_source, f"In caller_trivial_inline_{i}." + ) + self.assertEqual( + frame_0.line_entry.line, + trivial_line_num, + f"Stepped into the caller_trivial_inline_{i}", + ) + if pc != frame_0.pc: + # If we get here, we stepped to the expected line number, but + # the compiler on this system has decided to insert an instruction + # between the call site of an inlined function with no arguments, + # returning void, and its immediate call to another void inlined function + # with no arguments. We aren't going to be testing virtual inline + # stepping for this function... 
+ break + + if one_more_step_loc: + thread.StepInto() + frame_0 = thread.frame[0] + self.assertEqual( + frame_0.line_entry.line, + line_number(self.main_source, one_more_step_loc), + "Was able to step one more time", + ) + process.Kill() + target.Clear() + + def virtual_inline_stepping(self): + """Use the Python API's to step through a virtual inlined stack""" + self.run_to_call_site_and_step("At caller_trivial_inline_1", "main", 1) + self.run_to_call_site_and_step( + "In caller_trivial_inline_1", "caller_trivial_inline_1", 2 + ) + self.run_to_call_site_and_step( + "In caller_trivial_inline_2", "caller_trivial_inline_2", 3 + ) + self.run_to_call_site_and_step( + "In caller_trivial_inline_3", + "caller_trivial_inline_3", + 4, + "After caller_trivial_inline_3", + ) diff --git a/lldb/test/API/functionalities/inline-stepping/calling.cpp b/lldb/test/API/functionalities/inline-stepping/calling.cpp index 49179ce7c9788..ba71c25a3c648 100644 --- a/lldb/test/API/functionalities/inline-stepping/calling.cpp +++ b/lldb/test/API/functionalities/inline-stepping/calling.cpp @@ -13,6 +13,12 @@ int called_by_inline_ref (int &value); inline void inline_trivial_1 () __attribute__((always_inline)); inline void inline_trivial_2 () __attribute__((always_inline)); +// These three should share the same initial pc so we can test +// virtual inline stepping. +inline void caller_trivial_inline_1() __attribute__((always_inline)); +inline void caller_trivial_inline_2() __attribute__((always_inline)); +inline void caller_trivial_inline_3() __attribute__((always_inline)); + void caller_trivial_1 (); void caller_trivial_2 (); @@ -79,6 +85,23 @@ caller_trivial_2 () inline_value += 1; // At increment in caller_trivial_2. } +// When you call caller_trivial_inline_1, the inlined call-site +// should share a PC with all three of the following inlined +// functions, so we can exercise "virtual inline stepping". +void caller_trivial_inline_1() { + caller_trivial_inline_2(); // In caller_trivial_inline_1. 
+ inline_value += 1; +} + +void caller_trivial_inline_2() { + caller_trivial_inline_3(); // In caller_trivial_inline_2. + inline_value += 1; // After caller_trivial_inline_3 +} + +void caller_trivial_inline_3() { + inline_value += 1; // In caller_trivial_inline_3. +} + void called_by_inline_trivial () { @@ -132,5 +155,7 @@ main (int argc, char **argv) max_value(123, 456); // Call max_value template max_value(std::string("abc"), std::string("0022")); // Call max_value specialized + caller_trivial_inline_1(); // At caller_trivial_inline_1. + return 0; // About to return from main. } diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index ad48208f21e50..5cc43f3cd9910 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -3,11 +3,13 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil +import re class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) + @skipIf(compiler="clang", compiler_version=["<", "16.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() @@ -21,7 +23,7 @@ def test_frame_recognizer(self): # We never hide the frame of the entry-point into the standard library, even # if the name starts with `__` which usually indicates an internal function. 
"ranges_sort_less(int, int)": [ - "ranges::__sort::operator()", + re.compile("ranges::__sort::(__fn::)?operator\(\)"), "test_algorithms", ], # `ranges::views::transform` internally uses `std::invoke`, and that @@ -57,9 +59,14 @@ def test_frame_recognizer(self): ): frame_id = frame_id + 1 # Expect the correct parent frame - self.assertIn( - expected_parent, thread.GetFrameAtIndex(frame_id).GetFunctionName() - ) + func_name = thread.GetFrameAtIndex(frame_id).GetFunctionName() + if isinstance(expected_parent, re.Pattern): + self.assertTrue( + expected_parent.search(func_name) is not None, + f"'{expected_parent}' not found in '{func_name}'" + ) + else: + self.assertIn(expected_parent, func_name) frame_id = frame_id + 1 process.Continue() diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py index b5e8115160d20..41141164769ec 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespaceLookup.py @@ -8,7 +8,7 @@ from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - +from lldbsuite.test import lldbplatformutil class NamespaceLookupTestCase(TestBase): def setUp(self): @@ -167,7 +167,10 @@ def test_scope_lookup_with_run_command(self): self.runToBkpt("continue") # FIXME: In DWARF 5 with dsyms, the ordering of functions is slightly # different, which also hits the same issues mentioned previously. 
- if configuration.dwarf_version <= 4 or self.getDebugInfo() == "dwarf": + if ( + int(lldbplatformutil.getDwarfVersion()) <= 4 + or self.getDebugInfo() == "dwarf" + ): self.expect_expr("func()", result_type="int", result_value="2") # Continue to BP_ns_scope at ns scope diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 6481ae8b663c8..c8e4a4c461f12 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -268,12 +268,6 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] -if ( - "lldb-repro-capture" in config.available_features - or "lldb-repro-replay" in config.available_features -): - dotest_cmd += ["--skip-category=lldb-dap", "--skip-category=std-module"] - if "lldb-simulator-ios" in config.available_features: dotest_cmd += ["--apple-sdk", "iphonesimulator", "--platform-name", "ios-simulator"] elif "lldb-simulator-watchos" in config.available_features: diff --git a/lldb/test/API/python_api/process/io/TestProcessIO.py b/lldb/test/API/python_api/process/io/TestProcessIO.py index 3b5c7c48c51f4..5d9727add399b 100644 --- a/lldb/test/API/python_api/process/io/TestProcessIO.py +++ b/lldb/test/API/python_api/process/io/TestProcessIO.py @@ -99,31 +99,38 @@ def test_stdout_stderr_redirection(self): @expectedFlakeyLinux(bugnumber="llvm.org/pr26437") @skipIfDarwinEmbedded # debugserver can't create/write files on the device def test_stdout_stderr_redirection_to_existing_files(self): - """Exercise SBLaunchInfo::AddOpenFileAction() for STDOUT and STDERR without redirecting STDIN to output files already exist.""" + """Exercise SBLaunchInfo::AddOpenFileAction() for STDOUT and STDERR redirect to output files already exist.""" self.setup_test() self.build() self.create_target() - self.write_file_with_placeholder(self.output_file) - self.write_file_with_placeholder(self.error_file) - self.redirect_stdout() - self.redirect_stderr() - self.run_process(True) - output = 
self.read_output_file_and_delete() - error = self.read_error_file_and_delete() - self.check_process_output(output, error) - def write_file_with_placeholder(self, target_file): + # Create the output and error files with placeholder placeholder = "This content should be overwritten." + # Local file directory and working directory are the same for local debugging + f = open(self.local_output_file, "w") + f.write(placeholder) + f.close() + f = open(self.local_error_file, "w") + f.write(placeholder) + f.close() if lldb.remote_platform: self.runCmd( - 'platform file write "{target}" -d "{data}"'.format( - target=target_file, data=placeholder + 'platform put-file "{local}" "{remote}"'.format( + local=self.local_output_file, remote=self.output_file + ) + ) + self.runCmd( + 'platform put-file "{local}" "{remote}"'.format( + local=self.local_error_file, remote=self.error_file ) ) - else: - f = open(target_file, "w") - f.write(placeholder) - f.close() + + self.redirect_stdout() + self.redirect_stderr() + self.run_process(True) + output = self.read_output_file_and_delete() + error = self.read_error_file_and_delete() + self.check_process_output(output, error) # target_file - path on local file system or remote file system if running remote # local_file - path on local system diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py index bc4d00c17c555..09879276b44aa 100644 --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -6,7 +6,7 @@ from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - +from lldbsuite.test import lldbplatformutil class TypeAndTypeListTestCase(TestBase): def setUp(self): @@ -248,7 +248,7 @@ def test(self): self.assertEqual(myint_arr_element_type, myint_type) # Test enum methods. Requires DW_AT_enum_class which was added in Dwarf 4. 
- if configuration.dwarf_version >= 4: + if int(lldbplatformutil.getDwarfVersion()) >= 4: enum_type = target.FindFirstType("EnumType") self.assertTrue(enum_type) self.DebugSBType(enum_type) diff --git a/lldb/test/Shell/Driver/LocalLLDBInit.test b/lldb/test/Shell/Driver/LocalLLDBInit.test index e1b66a0998444..5db545e7ec561 100644 --- a/lldb/test/Shell/Driver/LocalLLDBInit.test +++ b/lldb/test/Shell/Driver/LocalLLDBInit.test @@ -1,5 +1,4 @@ # REQUIRES: python -# UNSUPPORTED: lldb-repro # # RUN: mkdir -p %t.root # RUN: mkdir -p %t.home diff --git a/lldb/test/Shell/Driver/TestCore.test b/lldb/test/Shell/Driver/TestCore.test index 2472617235124..cca8171da6312 100644 --- a/lldb/test/Shell/Driver/TestCore.test +++ b/lldb/test/Shell/Driver/TestCore.test @@ -1,4 +1,2 @@ -# UNSUPPORTED: lldb-repro -# # RUN: not %lldb -c /bogus/path 2>&1 | FileCheck %s # CHECK: error: file specified in --core (-c) option doesn't exist diff --git a/lldb/test/Shell/Driver/TestError.test b/lldb/test/Shell/Driver/TestError.test index 141c3ddf0f5f3..3d34a72b14aba 100644 --- a/lldb/test/Shell/Driver/TestError.test +++ b/lldb/test/Shell/Driver/TestError.test @@ -1,3 +1,2 @@ -UNSUPPORTED: lldb-repro RUN: not %lldb --arch 2>&1 | FileCheck %s CHECK: error: argument to '--arch' is missing diff --git a/lldb/test/Shell/Driver/TestFile.test b/lldb/test/Shell/Driver/TestFile.test index 776baf8ba0c5e..0e80594aeb1b5 100644 --- a/lldb/test/Shell/Driver/TestFile.test +++ b/lldb/test/Shell/Driver/TestFile.test @@ -1,4 +1,2 @@ -# UNSUPPORTED: lldb-repro -# # RUN: not %lldb -f /bogus/path 2>&1 | FileCheck %s # CHECK: error: file specified in --file (-f) option doesn't exist diff --git a/lldb/test/Shell/Driver/TestHelp.test b/lldb/test/Shell/Driver/TestHelp.test index 2521b31a61883..703000b6452e9 100644 --- a/lldb/test/Shell/Driver/TestHelp.test +++ b/lldb/test/Shell/Driver/TestHelp.test @@ -1,5 +1,3 @@ -UNSUPPORTED: lldb-repro - RUN: %lldb --help | FileCheck %s RUN: cat %S/../../../docs/man/lldb.rst | FileCheck %s 
diff --git a/lldb/test/Shell/Driver/TestPositionalArgs.test b/lldb/test/Shell/Driver/TestPositionalArgs.test index b4fa48f3af38f..0dbad97e888ca 100644 --- a/lldb/test/Shell/Driver/TestPositionalArgs.test +++ b/lldb/test/Shell/Driver/TestPositionalArgs.test @@ -1,5 +1,3 @@ -UNSUPPORTED: lldb-repro - RUN: echo "int main() { return 0; }" | %clang_host -x c - -o %t.foo RUN: %lldb -x -b %t.foo bar baz quux | FileCheck %s diff --git a/lldb/test/Shell/Driver/TestRepl.test b/lldb/test/Shell/Driver/TestRepl.test index a0bf8c26fd575..083863985a14b 100644 --- a/lldb/test/Shell/Driver/TestRepl.test +++ b/lldb/test/Shell/Driver/TestRepl.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: echo ':quit' | %lldb -x --repl -O 'expr 42' -S %S/Inputs/Print2.in -o 'expr 999999' -s %s 2>&1 | FileCheck %s # CHECK: {{w}}arning: commands specified to run after file load (via -o or -s) are ignored in REPL mode # CHECK: (int) $0 = 42 diff --git a/lldb/test/Shell/Process/TestEnvironment.test b/lldb/test/Shell/Process/TestEnvironment.test index e6d6e56fc9203..a9c624b8a4ec8 100644 --- a/lldb/test/Shell/Process/TestEnvironment.test +++ b/lldb/test/Shell/Process/TestEnvironment.test @@ -1,5 +1,4 @@ UNSUPPORTED: system-windows -UNSUPPORTED: lldb-repro The double quotes around "BAR" ensure we don't match the command. 
diff --git a/lldb/test/Shell/Quit/TestQuitExitCode-30.test b/lldb/test/Shell/Quit/TestQuitExitCode-30.test index b0b02bdf70041..2f15398c7614e 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCode-30.test +++ b/lldb/test/Shell/Quit/TestQuitExitCode-30.test @@ -1,4 +1,3 @@ # UNSUPPORTED: system-windows -# UNSUPPORTED: lldb-repro # RUN: %python %S/expect_exit_code.py 226 %lldb -b -s %s q -30 diff --git a/lldb/test/Shell/Quit/TestQuitExitCode30.test b/lldb/test/Shell/Quit/TestQuitExitCode30.test index 92ad3c6d1fe48..e5ff634e71367 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCode30.test +++ b/lldb/test/Shell/Quit/TestQuitExitCode30.test @@ -1,4 +1,3 @@ # UNSUPPORTED: system-windows -# UNSUPPORTED: lldb-repro # RUN: %python %S/expect_exit_code.py 30 %lldb -b -s %s q 30 diff --git a/lldb/test/Shell/Quit/TestQuitExitCodeHexA.test b/lldb/test/Shell/Quit/TestQuitExitCodeHexA.test index 59b7103ad0863..ca0e2d5acc3bc 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCodeHexA.test +++ b/lldb/test/Shell/Quit/TestQuitExitCodeHexA.test @@ -1,4 +1,3 @@ # UNSUPPORTED: system-windows -# UNSUPPORTED: lldb-repro # RUN: %python %S/expect_exit_code.py 10 %lldb -b -s %s q 0xA diff --git a/lldb/test/Shell/Register/x86-64-read.test b/lldb/test/Shell/Register/x86-64-read.test index ac3d4d1e27249..fc093190c2560 100644 --- a/lldb/test/Shell/Register/x86-64-read.test +++ b/lldb/test/Shell/Register/x86-64-read.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: lldb-repro # XFAIL: system-windows # REQUIRES: native && target-x86_64 # RUN: %clangxx_host %p/Inputs/x86-64-read.cpp -o %t diff --git a/lldb/test/Shell/Register/x86-64-ymm-read.test b/lldb/test/Shell/Register/x86-64-ymm-read.test index be9133b77d42c..0d01b0937f1d1 100644 --- a/lldb/test/Shell/Register/x86-64-ymm-read.test +++ b/lldb/test/Shell/Register/x86-64-ymm-read.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: lldb-repro # XFAIL: system-windows # REQUIRES: native && target-x86_64 && native-cpu-avx # RUN: %clangxx_host %p/Inputs/x86-ymm-read.cpp -o %t diff --git 
a/lldb/test/Shell/Register/x86-multithread-write.test b/lldb/test/Shell/Register/x86-multithread-write.test index 30273a6609dc8..cc02b323c7263 100644 --- a/lldb/test/Shell/Register/x86-multithread-write.test +++ b/lldb/test/Shell/Register/x86-multithread-write.test @@ -1,7 +1,6 @@ # XFAIL: system-windows # REQUIRES: native && (target-x86 || target-x86_64) # UNSUPPORTED: system-debugserver -# UNSUPPORTED: lldb-repro # RUN: %clangxx_host %p/Inputs/x86-multithread-write.cpp -o %t -pthread # RUN: %lldb -b -s %s %t | FileCheck %s diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/bindings.test b/lldb/test/Shell/ScriptInterpreter/Lua/bindings.test index d453f11f1ec76..f53f46b54da2a 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/bindings.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/bindings.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: cat %s | %lldb --script-language lua 2>&1 | FileCheck %s script debugger = lldb.SBDebugger.Create() diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test index 6ebcb953a96e0..a76378b9aa785 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # This tests that the convenience variables are not nil. Given that there is no # target we only expect the debugger to be valid. 
# diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/io.test b/lldb/test/Shell/ScriptInterpreter/Lua/io.test index 80b3b13cdd1de..f9ef84e957c53 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/io.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/io.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: rm -rf %t.stderr %t.stdout # RUN: cat %s | %lldb --script-language lua 2> %t.stderr > %t.stdout # RUN: cat %t.stdout | FileCheck %s --check-prefix STDOUT diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test b/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test index 38b4986a66730..b81bddc25177a 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/lua-python.test @@ -1,5 +1,4 @@ # REQUIRES: python -# UNSUPPORTED: lldb-repro # RUN: mkdir -p %t # RUN: cd %t diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/print.test b/lldb/test/Shell/ScriptInterpreter/Lua/print.test index f73d100459db5..9df24e8026507 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/print.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/print.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: rm -rf %t.stderr %t.stdout # RUN: cat %s | %lldb --script-language lua 2> %t.stderr > %t.stdout # RUN: cat %t.stdout | FileCheck %s --check-prefix STDOUT diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/quit.test b/lldb/test/Shell/ScriptInterpreter/Lua/quit.test index 0cef4c2f2c177..d66e17e25c73c 100644 --- a/lldb/test/Shell/ScriptInterpreter/Lua/quit.test +++ b/lldb/test/Shell/ScriptInterpreter/Lua/quit.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: cat %s | %lldb --script-language lua 2>&1 | FileCheck %s script print(95000 + 126) diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg index b72b294198931..52bffaeac372b 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg +++ 
b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg @@ -1,9 +1,6 @@ if 'system-darwin' not in config.available_features: config.unsupported = True -if 'lldb-repro' in config.available_features: - config.unsupported = True - config.environment["LLDB_APPLE_DSYMFORUUID_EXECUTABLE"] = "" # Temporary parallel image loading deadlock workaround diff --git a/lldb/test/Shell/ScriptInterpreter/Python/fail_breakpoint_oneline.test b/lldb/test/Shell/ScriptInterpreter/Python/fail_breakpoint_oneline.test index 67244641cf97f..d661e0a05f0fb 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/fail_breakpoint_oneline.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/fail_breakpoint_oneline.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # RUN: %lldb -s %s --script-language python 2>&1 | FileCheck %s b main breakpoint command add -s python -o "1234_foo" diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test b/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test index 9b8def211966b..9ce9837c8f278 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/sb_address_exception.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # Test that the SBAddress properties throw an exception when used outside of # the interactive script interpreter. # diff --git a/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint.test b/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint.test index 9f87a7d96e099..f6fff5f51de8e 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint.test @@ -1,5 +1,3 @@ -# UNSUPPORTED: lldb-repro -# # Test that the scripting language argument to "breakpoint command" is honored # even if the global scripting language is different. 
# diff --git a/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint_lua.test b/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint_lua.test index c86ae9057c502..5346d4ae32b99 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint_lua.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/scripted_breakpoint_lua.test @@ -1,5 +1,4 @@ # REQUIRES: python -# UNSUPPORTED: lldb-repro # # RUN: cat %s | %lldb --script-language lua 2>&1 | FileCheck %s b main diff --git a/lldb/test/Shell/Subprocess/lit.local.cfg b/lldb/test/Shell/Subprocess/lit.local.cfg deleted file mode 100644 index c9b378b7a8a5a..0000000000000 --- a/lldb/test/Shell/Subprocess/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if 'lldb-repro' in config.available_features: - config.unsupported = True diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug-types-mangled-name.ll b/lldb/test/Shell/SymbolFile/DWARF/debug-types-mangled-name.ll new file mode 100644 index 0000000000000..06dd817fc0f39 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/debug-types-mangled-name.ll @@ -0,0 +1,63 @@ +; Test finding types by CompilerContext. 
+; REQUIRES: aarch64 +; RUN: llc %s -filetype=obj -o %t.o +; RUN: lldb-test symbols %t.o -find=type --mangled-name=UniqueDifferentName | FileCheck %s +; +; NORESULTS: Found 0 types +; CHECK: Found 1 types: +; CHECK: struct DifferentName { +; CHECK-NEXT: int i; +; CHECK-NEXT: } + +source_filename = "t.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-unknown-linux-gnu" + +%struct.SameName = type { i32 } +%struct.DifferentName = type { i32 } + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 !dbg !10 { +entry: + %retval = alloca i32, align 4 + %s = alloca %struct.SameName, align 4 + %d = alloca %struct.DifferentName, align 4 + store i32 0, ptr %retval, align 4 + #dbg_declare(ptr %s, !16, !DIExpression(), !20) + #dbg_declare(ptr %d, !21, !DIExpression(), !25) + ret i32 0, !dbg !26 +} + +attributes #0 = { noinline optnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "t.c", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 1} +!9 = !{!""} +!10 = distinct !DISubprogram(name: "main", scope: !11, file: !11, line: 9, type: !12, scopeLine: 9, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !15) +!11 = !DIFile(filename: "t.c", directory: "") +!12 = !DISubroutineType(types: !13) +!13 = !{!14} +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !{} +!16 = !DILocalVariable(name: "s", scope: !10, file: !11, line: 10, type: !17) 
+!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "SameName", file: !11, line: 1, size: 32, elements: !18, runtimeLang: DW_LANG_Swift, identifier: "SameName") +!18 = !{!19} +!19 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !17, file: !11, line: 2, baseType: !14, size: 32) +!20 = !DILocation(line: 10, column: 19, scope: !10) +!21 = !DILocalVariable(name: "d", scope: !10, file: !11, line: 11, type: !22) +!22 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DifferentName", file: !11, line: 5, size: 32, elements: !23, runtimeLang: DW_LANG_Swift, identifier: "UniqueDifferentName") +!23 = !{!24} +!24 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !22, file: !11, line: 6, baseType: !14, size: 32) +!25 = !DILocation(line: 11, column: 24, scope: !10) +!26 = !DILocation(line: 12, column: 3, scope: !10) diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug_loc.s b/lldb/test/Shell/SymbolFile/DWARF/x86/debug_loc.s index f8e8bfba970f0..427db9797add4 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug_loc.s +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug_loc.s @@ -2,8 +2,6 @@ # behavior in the invalid cases is not particularly important, but it should be # "reasonable". 
-# UNSUPPORTED: lldb-repro - # RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s --defsym LOC=0 > %t # RUN: %lldb %t -o "image lookup -v -a 0" -o "image lookup -v -a 2" \ # RUN: -o "image dump symfile" -o exit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/ast-functions-msvc.cpp b/lldb/test/Shell/SymbolFile/NativePDB/ast-functions-msvc.cpp index b8154168aff3d..c0ae6e73f36d8 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/ast-functions-msvc.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/ast-functions-msvc.cpp @@ -3,5 +3,5 @@ // RUN: %build --compiler=msvc --nodefaultlib -o %t.exe -- %S/ast-functions.cpp -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/ast-functions.lldbinit 2>&1 | FileCheck %S/ast-functions.cpp diff --git a/lldb/test/Shell/SymbolFile/NativePDB/ast-functions.cpp b/lldb/test/Shell/SymbolFile/NativePDB/ast-functions.cpp index 7eb7a2cbe7d9a..d1cac393bbed9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/ast-functions.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/ast-functions.cpp @@ -4,8 +4,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/ast-functions.lldbinit 2>&1 | FileCheck %s +// RUN: %lldb -f %t.exe -s %p/Inputs/ast-functions.lldbinit 2>&1 | FileCheck %s static int static_fn() { return 42; diff --git a/lldb/test/Shell/SymbolFile/NativePDB/ast-methods.cpp b/lldb/test/Shell/SymbolFile/NativePDB/ast-methods.cpp index f2be33aae8163..91bd5bb810c8e 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/ast-methods.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/ast-methods.cpp @@ -3,10 +3,10 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -GR- -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env 
LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/ast-methods.lldbinit 2>&1 | FileCheck %s --check-prefix=AST -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols --dump-ast %t.exe | FileCheck %s --check-prefix=SYMBOL +// RUN: lldb-test symbols --dump-ast %t.exe | FileCheck %s --check-prefix=SYMBOL struct Struct { void simple_method() {} diff --git a/lldb/test/Shell/SymbolFile/NativePDB/ast-types.cpp b/lldb/test/Shell/SymbolFile/NativePDB/ast-types.cpp index 5554881716184..ac0d87e95dbf9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/ast-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/ast-types.cpp @@ -4,7 +4,7 @@ // Test various interesting cases for AST reconstruction. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/ast-types.lldbinit 2>&1 | FileCheck %s // Test trivial versions of each tag type. diff --git a/lldb/test/Shell/SymbolFile/NativePDB/bitfields.cpp b/lldb/test/Shell/SymbolFile/NativePDB/bitfields.cpp index bfa3cbc819472..72085f019e4d8 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/bitfields.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/bitfields.cpp @@ -4,7 +4,7 @@ // Test various interesting cases for AST reconstruction. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/bitfields.lldbinit 2>&1 | FileCheck %s // Test trivial versions of each tag type. 
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/blocks.s b/lldb/test/Shell/SymbolFile/NativePDB/blocks.s index 7a124702e4757..1f753a22372c3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/blocks.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/blocks.s @@ -4,7 +4,7 @@ // Test block range is set. // RUN: llvm-mc -triple=x86_64-windows-msvc --filetype=obj %s > %t.obj // RUN: lld-link /debug:full /nodefaultlib /entry:main %t.obj /out:%t.exe /base:0x140000000 -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o "image lookup -a 0x140001014 -v" -o "exit" | FileCheck %s +// RUN: %lldb %t.exe -o "image lookup -a 0x140001014 -v" -o "exit" | FileCheck %s // CHECK: Function: id = {{.*}}, name = "main", range = [0x0000000140001000-0x0000000140001044) // CHECK-NEXT: FuncType: id = {{.*}}, byte-size = 0, compiler_type = "int (void)" diff --git a/lldb/test/Shell/SymbolFile/NativePDB/break-by-function.cpp b/lldb/test/Shell/SymbolFile/NativePDB/break-by-function.cpp index 1768f127c9fa4..a580d574a9ca3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/break-by-function.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/break-by-function.cpp @@ -4,7 +4,7 @@ // Test that we can set simple breakpoints using PDB on any platform. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/break-by-function.lldbinit | FileCheck %s // Use different indentation style for each overload so that the starting diff --git a/lldb/test/Shell/SymbolFile/NativePDB/break-by-line.cpp b/lldb/test/Shell/SymbolFile/NativePDB/break-by-line.cpp index ebb7114b985e6..90ac633b01632 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/break-by-line.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/break-by-line.cpp @@ -4,7 +4,7 @@ // Test that we can set simple breakpoints using PDB on any platform. 
// RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/break-by-line.lldbinit | FileCheck %s // This is a separate test from break-by-function.cpp because this test is diff --git a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp index 0941dc7c51d01..36bfdb9a8e565 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp @@ -4,7 +4,7 @@ // Make sure class layout is correct. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/class_layout.lldbinit 2>&1 | FileCheck %s // CHECK: (lldb) expr a diff --git a/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp b/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp index 7d0c0b8e684ba..db3b85fa7e59f 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/disassembly.cpp @@ -4,7 +4,7 @@ // Test that we can show disassembly and source. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/disassembly.lldbinit | FileCheck %s // Some context lines before the function. 
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-builtins.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-builtins.cpp index 69d8d17179fe9..b2ef45feb4d01 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-builtins.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-builtins.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/function-types-builtins.lldbinit | FileCheck %s // Test that we can display function signatures with simple builtin diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-calling-conv.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-calling-conv.cpp index 42fd21637ea8a..92fa4394e860d 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-calling-conv.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-calling-conv.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cl --target=i386-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/function-types-calling-conv.lldbinit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp index ca2a84de7698a..c0de1ce6df4fd 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp @@ -4,7 +4,7 @@ // Test that we can display function signatures with class types. 
// RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/function-types-classes.lldbinit | FileCheck %s // This is just some unimportant helpers needed so that we can get reference and diff --git a/lldb/test/Shell/SymbolFile/NativePDB/global-classes.cpp b/lldb/test/Shell/SymbolFile/NativePDB/global-classes.cpp index 8f4aab6a8585d..8016d5200d410 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/global-classes.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/global-classes.cpp @@ -5,7 +5,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 \ // RUN: -Xclang -fkeep-static-consts -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/globals-classes.lldbinit | FileCheck %s enum class EnumType : unsigned { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/global-ctor-dtor.cpp b/lldb/test/Shell/SymbolFile/NativePDB/global-ctor-dtor.cpp index 15b4d330fabb0..5f6c68d69023e 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/global-ctor-dtor.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/global-ctor-dtor.cpp @@ -4,7 +4,7 @@ // Global ctor and dtor should be globals decls. 
// RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -GS- -fno-addrsig -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -force -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols --dump-ast %t.exe | FileCheck %s +// RUN: lldb-test symbols --dump-ast %t.exe | FileCheck %s struct A { ~A() {}; diff --git a/lldb/test/Shell/SymbolFile/NativePDB/globals-bss.cpp b/lldb/test/Shell/SymbolFile/NativePDB/globals-bss.cpp index 9c65c26499cd1..7f508ecb31b4d 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/globals-bss.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/globals-bss.cpp @@ -5,7 +5,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb // RUN: llvm-readobj -S %t.exe | FileCheck --check-prefix=BSS %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/globals-bss.lldbinit 2>&1 | FileCheck %s int GlobalVariable = 0; diff --git a/lldb/test/Shell/SymbolFile/NativePDB/globals-fundamental.cpp b/lldb/test/Shell/SymbolFile/NativePDB/globals-fundamental.cpp index 2787e65928319..299dd0b02671d 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/globals-fundamental.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/globals-fundamental.cpp @@ -5,7 +5,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 \ // RUN: -Xclang -fkeep-static-consts -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/globals-fundamental.lldbinit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/icf.cpp b/lldb/test/Shell/SymbolFile/NativePDB/icf.cpp index d9a7373bb12d6..d1e8732b68869 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/icf.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/icf.cpp @@ -4,7 
+4,7 @@ // Test lldb finds the correct parent context decl for functions and class methods when icf happens. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -GS- -fno-addrsig -c /Fo%t.obj -- %s // RUN: lld-link -opt:icf -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols --dump-ast %t.exe | FileCheck %s +// RUN: lldb-test symbols --dump-ast %t.exe | FileCheck %s struct A { int f1(int x) { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp b/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp index 8c16828690301..7bc7e618667f7 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/incomplete-tag-type.cpp @@ -4,7 +4,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -c /Fo%t1.obj -- %p/Inputs/incomplete-tag-type.cpp // RUN: %clang_cl --target=x86_64-windows-msvc /O1 /Z7 -c /Fo%t2.obj -- %s // RUN: lld-link /debug:full /nodefaultlib /entry:main %t1.obj %t2.obj /out:%t.exe /pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -o \ +// RUN: %lldb -f %t.exe -o \ // RUN: "settings set interpreter.stop-command-source-on-error false" \ // RUN: -o "expression b" -o "expression d" -o "expression static_e_ref" -o "exit" 2>&1 | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test index f1cf5ffdf7037..6293148d90ce4 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -triple=x86_64-windows-msvc --filetype=obj %p/Inputs/inline_sites.s > %t.obj # RUN: lld-link -debug:full -nodefaultlib -entry:main -base:0x140000000 %t.obj -out:%t.exe -# RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +# RUN: %lldb -f %t.exe -s \ # RUN: %p/Inputs/inline_sites.lldbinit 2>&1 | FileCheck %s # CHECK: (lldb) 
image dump line-table a.cpp -v diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp index 767149ea18c46..df6353e28303a 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp @@ -2,7 +2,7 @@ // REQUIRES: system-windows // RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s void use(int) {} diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lit.local.cfg b/lldb/test/Shell/SymbolFile/NativePDB/lit.local.cfg index c9b378b7a8a5a..02bc504eea55c 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lit.local.cfg +++ b/lldb/test/Shell/SymbolFile/NativePDB/lit.local.cfg @@ -1,2 +1 @@ -if 'lldb-repro' in config.available_features: - config.unsupported = True +config.environment["LLDB_USE_NATIVE_PDB_READER"] = "1" diff --git a/lldb/test/Shell/SymbolFile/NativePDB/load-pdb.cpp b/lldb/test/Shell/SymbolFile/NativePDB/load-pdb.cpp index 8840a1242a044..3ff1dffab0775 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/load-pdb.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/load-pdb.cpp @@ -10,7 +10,7 @@ // RUN: -out:%t/executable/foo.exe -pdb:%t/executable/foo.pdb // Rename the PDB file so that the name is different from the name inside the executable (foo.exe). 
// RUN: mv %t/executable/foo.pdb %t/executable/bar.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t/executable/foo.exe \ +// RUN: %lldb %t/executable/foo.exe \ // RUN: -o "target symbols add %t/executable/bar.pdb" \ // RUN: -o "b main" \ // RUN: -o "image dump symfile" -o "quit" | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s b/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s index ad2d0704cdf41..85d92a2447939 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/local-variables-registers.s @@ -3,7 +3,7 @@ # RUN: llvm-mc -triple=x86_64-windows-msvc --filetype=obj %s > %t.obj # RUN: lld-link /debug:full /nodefaultlib /entry:main %t.obj /out:%t.exe /base:0x140000000 -# RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +# RUN: %lldb -f %t.exe -s \ # RUN: %p/Inputs/local-variables-registers.lldbinit 2>&1 | FileCheck %s # This file is compiled from following source file: diff --git a/lldb/test/Shell/SymbolFile/NativePDB/local-variables.cpp b/lldb/test/Shell/SymbolFile/NativePDB/local-variables.cpp index 9aa25adf6bcc7..f6576090b4f32 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/local-variables.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/local-variables.cpp @@ -2,7 +2,7 @@ // REQUIRES: system-windows // RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/local-variables.lldbinit 2>&1 | FileCheck %s int Function(int Param1, char Param2) { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/locate-pdb.cpp b/lldb/test/Shell/SymbolFile/NativePDB/locate-pdb.cpp index 7b3f6f9767961..c0739e4dfaba4 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/locate-pdb.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/locate-pdb.cpp @@ -17,12 +17,12 @@ // RUN: -out:%t/executable/foo.exe -pdb:%t/symbols/bar.pdb // Find the PDB in its build location -// RUN: env 
LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t/executable/foo.exe -s \ +// RUN: %lldb -f %t/executable/foo.exe -s \ // RUN: %p/Inputs/locate-pdb.lldbinit | FileCheck %s // Also find the PDB when it's adjacent to the executable // RUN: mv -f %t/symbols/bar.pdb %t/executable/bar.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t/executable/foo.exe -s \ +// RUN: %lldb -f %t/executable/foo.exe -s \ // RUN: %p/Inputs/locate-pdb.lldbinit | FileCheck %s int main(int argc, char** argv) { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-address.cpp b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-address.cpp index a2c00a48bf9ea..36434b3860703 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-address.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-address.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -GR- -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -base:0x400000 -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -O "target create %t.exe" -o "image lookup -a 0x401000" -o "exit" | FileCheck %s --check-prefix=ADDRESS +// RUN: %lldb -O "target create %t.exe" -o "image lookup -a 0x401000" -o "exit" | FileCheck %s --check-prefix=ADDRESS int main(int argc, char **argv) { return 0; } diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp index f3aea8115f385..d035271893734 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp @@ -1,7 +1,7 @@ // clang-format off // RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s class B; diff --git a/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s b/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s index 
0ab76dacdaded..dc3ee844fe364 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s +++ b/lldb/test/Shell/SymbolFile/NativePDB/nested-blocks-same-address.s @@ -4,7 +4,7 @@ # Test when nested S_BLOCK32 have same address range, ResolveSymbolContext should return the innnermost block. # RUN: llvm-mc -triple=x86_64-windows-msvc --filetype=obj %s > %t.obj # RUN: lld-link /debug:full /nodefaultlib /entry:main %t.obj /out:%t.exe /base:0x140000000 -# RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -o "image lookup -a 0x14000103c -v" -o "exit" | FileCheck %s +# RUN: %lldb -f %t.exe -o "image lookup -a 0x14000103c -v" -o "exit" | FileCheck %s # This file is compiled from following source file: # $ clang-cl /Z7 /GS- /c /O2 test.cpp /Fatest.s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/nested-types.cpp b/lldb/test/Shell/SymbolFile/NativePDB/nested-types.cpp index b188b9f6806fc..f725037a220d9 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/nested-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/nested-types.cpp @@ -5,7 +5,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 \ // RUN: -Xclang -fkeep-static-consts -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/nested-types.lldbinit 2>&1 | FileCheck %s struct S { diff --git a/lldb/test/Shell/SymbolFile/NativePDB/s_constant.cpp b/lldb/test/Shell/SymbolFile/NativePDB/s_constant.cpp index ac3ecbbbf09df..32785b3b25cae 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/s_constant.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/s_constant.cpp @@ -5,7 +5,7 @@ // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %p/Inputs/s_constant.s > %t.obj // RUN: %build --compiler=clang-cl --nodefaultlib --mode=link -o %t.exe -- %t.obj -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: 
%p/Inputs/s_constant.lldbinit | FileCheck %s // clang-cl cannot generate S_CONSTANT records, but we need to test that we can diff --git a/lldb/test/Shell/SymbolFile/NativePDB/source-list.cpp b/lldb/test/Shell/SymbolFile/NativePDB/source-list.cpp index fb749c145aca9..73a32bde986dc 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/source-list.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/source-list.cpp @@ -4,7 +4,7 @@ // Test that we can set display source of functions. // RUN: %clang_cl --target=x86_64-windows-msvc -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/source-list.lldbinit | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/stack_unwinding01.cpp b/lldb/test/Shell/SymbolFile/NativePDB/stack_unwinding01.cpp index e96e3ed6a0107..9e799fb635a2f 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/stack_unwinding01.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/stack_unwinding01.cpp @@ -2,7 +2,7 @@ // REQUIRES: lld, system-windows // RUN: %build --compiler=clang-cl --nodefaultlib -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/stack_unwinding01.lldbinit 2>&1 | FileCheck %s diff --git a/lldb/test/Shell/SymbolFile/NativePDB/tag-types.cpp b/lldb/test/Shell/SymbolFile/NativePDB/tag-types.cpp index 03cf25d0d4c03..2d20375745ec3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/tag-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/tag-types.cpp @@ -4,7 +4,7 @@ // Test that we can display tag types. 
// RUN: %clang_cl --target=x86_64-windows-msvc -GS- -Od -Z7 -c /Fo%t.obj -- %s // RUN: lld-link -debug:full -nodefaultlib -entry:main %t.obj -out:%t.exe -pdb:%t.pdb -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %lldb -f %t.exe -s \ // RUN: %p/Inputs/tag-types.lldbinit | FileCheck %s // Test struct diff --git a/lldb/test/Shell/SymbolFile/NativePDB/typedefs.cpp b/lldb/test/Shell/SymbolFile/NativePDB/typedefs.cpp index e303a4f43636d..17d23660c33db 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/typedefs.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/typedefs.cpp @@ -2,7 +2,7 @@ // REQUIRES: system-windows // RUN: %build --compiler=clang-cl --nodefaultlib -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck %s +// RUN: lldb-test symbols -dump-ast %t.exe | FileCheck %s namespace A { namespace B { diff --git a/lldb/test/Shell/SymbolFile/PDB/ast-restore.test b/lldb/test/Shell/SymbolFile/PDB/ast-restore.test index 2763f46070244..a2597c46ba31b 100644 --- a/lldb/test/Shell/SymbolFile/PDB/ast-restore.test +++ b/lldb/test/Shell/SymbolFile/PDB/ast-restore.test @@ -1,10 +1,10 @@ REQUIRES: system-windows, msvc RUN: %build --compiler=msvc --nodefaultlib --output=%t.exe %S/Inputs/AstRestoreTest.cpp -RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=ENUM %s +RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=ENUM %s RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=ENUM %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=GLOBAL %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=BASE %s -RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=CLASS %s +RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=CLASS %s RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -dump-ast %t.exe | FileCheck 
--check-prefix=CLASS %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=INNER %s RUN: lldb-test symbols -dump-ast %t.exe | FileCheck --check-prefix=TEMPLATE %s diff --git a/lldb/test/Shell/SymbolFile/PDB/compilands.test b/lldb/test/Shell/SymbolFile/PDB/compilands.test index ecee5eb50d399..f0fdce0f68cd9 100644 --- a/lldb/test/Shell/SymbolFile/PDB/compilands.test +++ b/lldb/test/Shell/SymbolFile/PDB/compilands.test @@ -2,7 +2,7 @@ REQUIRES: system-windows, msvc RUN: %build --compiler=clang-cl --mode=compile --arch=32 --nodefaultlib --output=%T/CompilandsTest.cpp.obj %S/Inputs/CompilandsTest.cpp RUN: %build --compiler=msvc --mode=link --arch=32 --nodefaultlib --output=%T/CompilandsTest.cpp.exe %T/CompilandsTest.cpp.obj RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols %T/CompilandsTest.cpp.exe | FileCheck %s -RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols %T/CompilandsTest.cpp.exe | FileCheck %s +RUN: lldb-test symbols %T/CompilandsTest.cpp.exe | FileCheck %s ; Link default libraries diff --git a/lldb/test/Shell/SymbolFile/PDB/function-level-linking.test b/lldb/test/Shell/SymbolFile/PDB/function-level-linking.test index ec0ef57440070..ff8eec44e3dbd 100644 --- a/lldb/test/Shell/SymbolFile/PDB/function-level-linking.test +++ b/lldb/test/Shell/SymbolFile/PDB/function-level-linking.test @@ -2,4 +2,4 @@ REQUIRES: system-windows, lld RUN: %clang_cl_host /c /Zi /Gy %S/Inputs/FunctionLevelLinkingTest.cpp /o %t.obj RUN: lld-link /debug:full /nodefaultlib /entry:main /order:@%S/Inputs/FunctionLevelLinkingTest.ord %t.obj /out:%t.exe RUN: env LLDB_USE_NATIVE_PDB_READER=1 lldb-test symbols -verify %t.exe -RUN: env LLDB_USE_NATIVE_PDB_READER=0 lldb-test symbols -verify %t.exe +RUN: lldb-test symbols -verify %t.exe diff --git a/lldb/test/Shell/SymbolFile/PDB/lit.local.cfg b/lldb/test/Shell/SymbolFile/PDB/lit.local.cfg index c9b378b7a8a5a..cb354a3f34043 100644 --- a/lldb/test/Shell/SymbolFile/PDB/lit.local.cfg +++ 
b/lldb/test/Shell/SymbolFile/PDB/lit.local.cfg @@ -1,2 +1 @@ -if 'lldb-repro' in config.available_features: - config.unsupported = True +config.environment["LLDB_USE_NATIVE_PDB_READER"] = "0" diff --git a/lldb/test/Shell/SymbolFile/PDB/variables-locations.test b/lldb/test/Shell/SymbolFile/PDB/variables-locations.test index b5bfc6fe81af9..526e53bba3b85 100644 --- a/lldb/test/Shell/SymbolFile/PDB/variables-locations.test +++ b/lldb/test/Shell/SymbolFile/PDB/variables-locations.test @@ -1,6 +1,6 @@ REQUIRES: system-windows, lld RUN: %build --compiler=clang-cl --output=%t.exe %S/Inputs/VariablesLocationsTest.cpp -RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb -b -s %S/Inputs/VariablesLocationsTest.script -- %t.exe | FileCheck %s +RUN: %lldb -b -s %S/Inputs/VariablesLocationsTest.script -- %t.exe | FileCheck %s RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -b -s %S/Inputs/VariablesLocationsTest.script -- %t.exe | FileCheck %s CHECK: g_var = 2222 diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 81ae490f6a7dc..42968128f2702 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -283,20 +283,3 @@ def use_support_substitutions(config): llvm_config.add_tool_substitutions(support_tools, additional_tool_dirs) _disallow(config, "clang") - - -def use_lldb_repro_substitutions(config, mode): - lldb_init = _get_lldb_init_path(config) - substitutions = [ - ToolSubst( - "%lldb", - command=FindTool("lldb-repro"), - extra_args=[mode, "--no-lldbinit", "-S", lldb_init], - ), - ToolSubst( - "%lldb-init", - command=FindTool("lldb-repro"), - extra_args=[mode, "-S", lldb_init], - ), - ] - llvm_config.add_tool_substitutions(substitutions, [config.lldb_tools_dir]) diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 9a7b71889f8df..24c910d2fc561 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -59,15 +59,6 @@ config.environment["TSAN_OPTIONS"] = "halt_on_error=1" -# 
Support running the test suite under the lldb-repro wrapper. This makes it -# possible to capture a test suite run and then rerun all the test from the -# just captured reproducer. -lldb_repro_mode = lit_config.params.get("lldb-run-with-repro", None) -if lldb_repro_mode: - config.available_features.add("lldb-repro") - lit_config.note("Running Shell tests in {} mode.".format(lldb_repro_mode)) - toolchain.use_lldb_repro_substitutions(config, lldb_repro_mode) - if config.lldb_platform_url and config.cmake_sysroot and config.enable_remote: if re.match(r".*-linux.*", config.target_triple): config.available_features.add("remote-linux") diff --git a/lldb/tools/lldb-fuzzer/CMakeLists.txt b/lldb/tools/lldb-fuzzer/CMakeLists.txt index 4c081a9de53e2..e384ca1858398 100644 --- a/lldb/tools/lldb-fuzzer/CMakeLists.txt +++ b/lldb/tools/lldb-fuzzer/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(lldb-commandinterpreter-fuzzer) +add_subdirectory(lldb-dwarf-expression-fuzzer) add_subdirectory(lldb-expression-fuzzer) add_subdirectory(lldb-target-fuzzer) add_subdirectory(utils) diff --git a/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/CMakeLists.txt b/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/CMakeLists.txt new file mode 100644 index 0000000000000..464696fc051d6 --- /dev/null +++ b/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/CMakeLists.txt @@ -0,0 +1,33 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +add_llvm_fuzzer(lldb-dwarf-expression-fuzzer + EXCLUDE_FROM_ALL + lldb-dwarf-expression-fuzzer.cpp + ) + +if(TARGET lldb-dwarf-expression-fuzzer) + target_include_directories(lldb-dwarf-expression-fuzzer PRIVATE ..) 
+ target_include_directories(lldb-dwarf-expression-fuzzer PRIVATE ${LLDB_SOURCE_ROOT}) + target_link_libraries(lldb-dwarf-expression-fuzzer + PRIVATE + lldbCore + lldbPluginExpressionParserClang + lldbPluginPlatformLinux + lldbPluginTypeSystemClang + lldbFuzzerUtils + ) + + add_custom_command(TARGET lldb-dwarf-expression-fuzzer PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/fuzzer-artifacts/dwarf-expression-artifacts + ) + + add_custom_target(fuzz-lldb-dwarf-expression + COMMENT "Running the LLDB DWARF expression evaluator fuzzer..." + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/fuzzer-artifacts/dwarf-expression-artifacts + COMMAND $ -artifact_prefix=dwarf-expression- + USES_TERMINAL + ) + set_target_properties(fuzz-lldb-dwarf-expression PROPERTIES FOLDER "LLDB/Fuzzer") +endif() diff --git a/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/lldb-dwarf-expression-fuzzer.cpp b/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/lldb-dwarf-expression-fuzzer.cpp new file mode 100644 index 0000000000000..86c3709b3a829 --- /dev/null +++ b/lldb/tools/lldb-fuzzer/lldb-dwarf-expression-fuzzer/lldb-dwarf-expression-fuzzer.cpp @@ -0,0 +1,83 @@ +//===-- lldb-target-fuzzer.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "utils/TempFile.h" + +#include "Plugins/Platform/Linux/PlatformLinux.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/Value.h" +#include "lldb/Expression/DWARFExpression.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/HostInfo.h" +#include "lldb/Target/Target.h" + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::plugin::dwarf; +using namespace lldb_fuzzer; + +extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { + FileSystem::Initialize(); + HostInfo::Initialize(); + platform_linux::PlatformLinux::Initialize(); + return 0; +} + +static void Evaluate(llvm::ArrayRef expr, + lldb::ModuleSP module_sp = {}, DWARFUnit *unit = nullptr, + ExecutionContext *exe_ctx = nullptr) { + DataExtractor extractor(expr.data(), expr.size(), lldb::eByteOrderLittle, + /*addr_size*/ 4); + + llvm::Expected result = + DWARFExpression::Evaluate(exe_ctx, /*reg_ctx*/ nullptr, module_sp, + extractor, unit, lldb::eRegisterKindLLDB, + /*initial_value_ptr*/ nullptr, + /*object_address_ptr*/ nullptr); + + if (!result) + llvm::consumeError(result.takeError()); +} + +class MockTarget : public Target { +public: + MockTarget(Debugger &debugger, const ArchSpec &target_arch, + const lldb::PlatformSP &platform_sp, llvm::ArrayRef data) + : Target(debugger, target_arch, platform_sp, true), m_data(data) {} + + size_t ReadMemory(const Address &addr, void *dst, size_t dst_len, + Status &error, bool force_live_memory = false, + lldb::addr_t *load_addr_ptr = nullptr) override { + std::memcpy(dst, m_data.data(), m_data.size()); + return m_data.size(); + } + +private: + llvm::ArrayRef m_data; +}; + +extern "C" int LLVMFuzzerTestOneInput(uint8_t *data, size_t size) { + // We're going to use the first half of the input data as the DWARF expression + // and the second half as memory. 
+ const size_t partition = size / 2; + llvm::ArrayRef expression_data(data, partition); + llvm::ArrayRef memory_data(data + partition, size - partition); + + // Create a mock target for reading memory. + ArchSpec arch("i386-pc-linux"); + Platform::SetHostPlatform( + platform_linux::PlatformLinux::CreateInstance(true, &arch)); + lldb::DebuggerSP debugger_sp = Debugger::CreateInstance(); + lldb::PlatformSP platform_sp; + auto target_sp = std::make_shared(*debugger_sp, arch, platform_sp, + memory_data); + ExecutionContext exe_ctx(static_cast(target_sp), false); + + Evaluate(expression_data); + return 0; +} diff --git a/lldb/tools/lldb-test/lldb-test.cpp b/lldb/tools/lldb-test/lldb-test.cpp index 50b85d4b51209..1960240dc4151 100644 --- a/lldb/tools/lldb-test/lldb-test.cpp +++ b/lldb/tools/lldb-test/lldb-test.cpp @@ -13,6 +13,7 @@ #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "lldb/Breakpoint/BreakpointLocation.h" #include "lldb/Core/Debugger.h" +#include "lldb/Core/Mangled.h" #include "lldb/Core/Module.h" #include "lldb/Core/Section.h" #include "lldb/Expression/IRMemoryMap.h" @@ -23,6 +24,7 @@ #include "lldb/Symbol/LineTable.h" #include "lldb/Symbol/SymbolFile.h" #include "lldb/Symbol/Symtab.h" +#include "lldb/Symbol/Type.h" #include "lldb/Symbol/TypeList.h" #include "lldb/Symbol/TypeMap.h" #include "lldb/Symbol/VariableList.h" @@ -179,6 +181,10 @@ static cl::opt Find( static cl::opt Name("name", cl::desc("Name to find."), cl::sub(SymbolsSubcommand)); +static cl::opt MangledName( + "mangled-name", + cl::desc("Mangled name to find. 
Only compatible when searching types"), + cl::sub(SymbolsSubcommand)); static cl::opt Regex("regex", cl::desc("Search using regular expressions (available for variables " @@ -468,6 +474,9 @@ static lldb::DescriptionLevel GetDescriptionLevel() { } Error opts::symbols::findFunctions(lldb_private::Module &Module) { + if (!MangledName.empty()) + return make_string_error("Cannot search functions by mangled name."); + SymbolFile &Symfile = *Module.GetSymbolFile(); SymbolContextList List; auto compiler_context = parseCompilerContext(); @@ -529,6 +538,8 @@ Error opts::symbols::findBlocks(lldb_private::Module &Module) { assert(!Regex); assert(!File.empty()); assert(Line != 0); + if (!MangledName.empty()) + return make_string_error("Cannot search blocks by mangled name."); SymbolContextList List; @@ -563,6 +574,9 @@ Error opts::symbols::findBlocks(lldb_private::Module &Module) { } Error opts::symbols::findNamespaces(lldb_private::Module &Module) { + if (!MangledName.empty()) + return make_string_error("Cannot search namespaces by mangled name."); + SymbolFile &Symfile = *Module.GetSymbolFile(); Expected ContextOr = getDeclContext(Symfile); if (!ContextOr) @@ -585,11 +599,15 @@ Error opts::symbols::findTypes(lldb_private::Module &Module) { Expected ContextOr = getDeclContext(Symfile); if (!ContextOr) return ContextOr.takeError(); + ; TypeQueryOptions Opts = TypeQueryOptions::e_module_search; if (FindInAnyModule) Opts |= TypeQueryOptions::e_ignore_modules; TypeResults results; + if (!Name.empty() && !MangledName.empty()) + return make_string_error("Cannot search by both name and mangled name."); + if (!Name.empty()) { if (ContextOr->IsValid()) { TypeQuery query(*ContextOr, ConstString(Name), Opts); @@ -602,6 +620,20 @@ Error opts::symbols::findTypes(lldb_private::Module &Module) { query.AddLanguage(Language::GetLanguageTypeFromString(Language)); Symfile.FindTypes(query, results); } + } else if (!MangledName.empty()) { + Opts = TypeQueryOptions::e_search_by_mangled_name; + if 
(ContextOr->IsValid()) { + TypeQuery query(*ContextOr, ConstString(MangledName), Opts); + if (!Language.empty()) + query.AddLanguage(Language::GetLanguageTypeFromString(Language)); + Symfile.FindTypes(query, results); + } else { + TypeQuery query(MangledName, Opts); + if (!Language.empty()) + query.AddLanguage(Language::GetLanguageTypeFromString(Language)); + Symfile.FindTypes(query, results); + } + } else { TypeQuery query(parseCompilerContext(), Opts); if (!Language.empty()) @@ -619,6 +651,9 @@ Error opts::symbols::findTypes(lldb_private::Module &Module) { } Error opts::symbols::findVariables(lldb_private::Module &Module) { + if (!MangledName.empty()) + return make_string_error("Cannot search variables by mangled name."); + SymbolFile &Symfile = *Module.GetSymbolFile(); VariableList List; if (Regex) { diff --git a/lldb/unittests/Host/FileActionTest.cpp b/lldb/unittests/Host/FileActionTest.cpp index 56227cd587e5b..ac067c4d3349b 100644 --- a/lldb/unittests/Host/FileActionTest.cpp +++ b/lldb/unittests/Host/FileActionTest.cpp @@ -34,7 +34,9 @@ TEST(FileActionTest, OpenReadWrite) { TEST(FileActionTest, OpenReadOnly) { FileAction Action; Action.Open(49, FileSpec("/tmp_1"), /*read*/ true, /*write*/ false); +#ifndef _WIN32 EXPECT_TRUE(Action.GetActionArgument() & (O_NOCTTY | O_RDONLY)); +#endif EXPECT_FALSE(Action.GetActionArgument() & O_WRONLY); } diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp index b90fbb7830995..365ebc8e52c24 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp @@ -760,7 +760,7 @@ class NewStyle(object): EXPECT_EQ(arginfo.get().max_positional_args, 3u); } -#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 3 +#if PY_VERSION_HEX >= 0x03030000 // the old implementation of GetArgInfo just doesn't work on builtins. 
diff --git a/lldb/utils/CMakeLists.txt b/lldb/utils/CMakeLists.txt index 00c81e655b756..d08f66f7b6c56 100644 --- a/lldb/utils/CMakeLists.txt +++ b/lldb/utils/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(lit-cpuid) add_subdirectory(lldb-dotest) -add_subdirectory(lldb-repro) diff --git a/lldb/utils/lldb-repro/CMakeLists.txt b/lldb/utils/lldb-repro/CMakeLists.txt deleted file mode 100644 index 8ca02b9fb8193..0000000000000 --- a/lldb/utils/lldb-repro/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -add_custom_target(lldb-repro) -add_dependencies(lldb-repro lldb-test-depends) -set_target_properties(lldb-repro PROPERTIES FOLDER "LLDB/Utils") - -# Generate lldb-repro Python script for each build mode. -if(LLDB_BUILT_STANDALONE) - set(config_types ".") - if(CMAKE_CONFIGURATION_TYPES) - set(config_types ${CMAKE_CONFIGURATION_TYPES}) - endif() - - foreach(config_type ${config_types}) - string(REPLACE ${CMAKE_CFG_INTDIR} ${config_type} config_runtime_output_dir ${LLVM_RUNTIME_OUTPUT_INTDIR}) - configure_file(lldb-repro.py ${config_runtime_output_dir}/lldb-repro COPYONLY) - endforeach() -elseif(NOT "${CMAKE_CFG_INTDIR}" STREQUAL ".") - foreach(LLVM_BUILD_MODE ${CMAKE_CONFIGURATION_TYPES}) - string(REPLACE ${CMAKE_CFG_INTDIR} ${LLVM_BUILD_MODE} LLDB_REPRO_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) - configure_file(lldb-repro.py ${LLDB_REPRO_DIR}/lldb-repro COPYONLY) - endforeach() -else() - configure_file(lldb-repro.py ${LLVM_RUNTIME_OUTPUT_INTDIR}/lldb-repro COPYONLY) -endif() diff --git a/lldb/utils/lldb-repro/lldb-repro.py b/lldb/utils/lldb-repro/lldb-repro.py deleted file mode 100755 index 30788d6a815e1..0000000000000 --- a/lldb/utils/lldb-repro/lldb-repro.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python -"""lldb-repro - -lldb-repro is a utility to transparently capture and replay debugger sessions -through the command line driver. Its used to test the reproducers by running -the test suite twice. 
- -During the first run, with 'capture' as its first argument, it captures a -reproducer for every lldb invocation and saves it to a well-know location -derived from the arguments and current working directory. - -During the second run, with 'replay' as its first argument, the test suite is -run again but this time every invocation of lldb replays the previously -recorded session. -""" - -import hashlib -import os -import shutil -import subprocess -import sys -import tempfile - - -def help(): - print("usage: {} capture|replay [args]".format(sys.argv[0])) - - -def main(): - if len(sys.argv) < 2: - help() - return 1 - - # Compute an MD5 hash based on the input arguments and the current working - # directory. - h = hashlib.md5() - h.update(" ".join(sys.argv[2:]).encode("utf-8")) - h.update(os.getcwd().encode("utf-8")) - input_hash = h.hexdigest() - - # Use the hash to "uniquely" identify a reproducer path. - reproducer_path = os.path.join(tempfile.gettempdir(), input_hash) - - # Create a new lldb invocation with capture or replay enabled. - lldb = os.path.join(os.path.dirname(sys.argv[0]), "lldb") - new_args = [lldb] - if sys.argv[1] == "replay": - new_args.extend(["--replay", reproducer_path]) - elif sys.argv[1] == "capture": - new_args.extend( - [ - "--capture", - "--capture-path", - reproducer_path, - "--reproducer-generate-on-exit", - ] - ) - new_args.extend(sys.argv[2:]) - else: - help() - return 1 - - exit_code = subprocess.call(new_args) - - # The driver always exists with a zero exit code during replay. Store the - # exit code and return that for tests that expect a non-zero exit code. 
- exit_code_path = os.path.join(reproducer_path, "exit_code.txt") - if sys.argv[1] == "replay": - replay_exit_code = exit_code - with open(exit_code_path, "r") as f: - exit_code = int(f.read()) - if replay_exit_code != 0: - print("error: replay failed with exit code {}".format(replay_exit_code)) - print("invocation: " + " ".join(new_args)) - # Return 1 if the expected exit code is 0 or vice versa. - return 1 if (exit_code == 0) else 0 - shutil.rmtree(reproducer_path, True) - elif sys.argv[1] == "capture": - with open(exit_code_path, "w") as f: - f.write("%d" % exit_code) - - return exit_code - - -if __name__ == "__main__": - exit(main()) diff --git a/llvm/docs/CodeOfConduct.rst b/llvm/docs/CodeOfConduct.rst index f8fff8bac52a5..e075b4ff377f9 100644 --- a/llvm/docs/CodeOfConduct.rst +++ b/llvm/docs/CodeOfConduct.rst @@ -24,12 +24,12 @@ the spirit in which it's intended - a guide to make it easier to communicate and participate in the community. This code of conduct applies to all spaces managed by the LLVM project or The -LLVM Foundation. This includes IRC channels, mailing lists, bug trackers, LLVM -events such as the developer meetings and socials, and any other forums created -by the project that the community uses for communication. It applies to all of -your communication and conduct in these spaces, including emails, chats, things -you say, slides, videos, posters, signs, or even t-shirts you display in these -spaces. +LLVM Foundation. This includes IRC and Discord channels, mailing lists, bug +trackers, LLVM events such as the developer meetings and socials, and any other +forums created by the project that the community uses for communication. It +applies to all of your communication and conduct in these spaces, including +emails, chats, things you say, slides, videos, posters, signs, or even t-shirts +you display in these spaces. In rare cases, violations of this code outside of these spaces may affect a person’s ability to participate within these spaces. 
Important examples diff --git a/llvm/docs/CodeReview.rst b/llvm/docs/CodeReview.rst index 56798ae4faf0c..c3ca82d061f00 100644 --- a/llvm/docs/CodeReview.rst +++ b/llvm/docs/CodeReview.rst @@ -89,10 +89,10 @@ When Is an RFC Required? Some changes are too significant for just a code review. Changes that should change the LLVM Language Reference (e.g., adding new target-independent intrinsics), adding language extensions in Clang, and so on, require an RFC -(Request for Comment) email on the project's ``*-dev`` mailing list first. For -changes that promise significant impact on users and/or downstream code bases, -reviewers can request an RFC achieving consensus before proceeding with code -review. That having been said, posting initial patches can help with +(Request for Comment) topic on the `LLVM Discussion Forums `_ +first. For changes that promise significant impact on users and/or downstream +code bases, reviewers can request an RFC achieving consensus before proceeding +with code review. That having been said, posting initial patches can help with discussions on an RFC. Code-Review Workflow @@ -142,12 +142,18 @@ from specific performance tests), please explain as many of these up front as possible. This allows the patch author and reviewers to make the most efficient use of their time. +.. _lgtm_how_a_patch_is_accepted: + LGTM - How a Patch Is Accepted ------------------------------ A patch is approved to be committed when a reviewer accepts it, and this is almost always associated with a message containing the text "LGTM" (which -stands for Looks Good To Me). Only approval from a single reviewer is required. +stands for Looks Good To Me). + +Only approval from a single reviewer is required, unless the pull request +has required reviewers. In which case, you must have approval from all of those +reviewers. 
When providing an unqualified LGTM (approval to commit), it is the responsibility of the reviewer to have reviewed all of the discussion and @@ -248,8 +254,8 @@ larger features. Common ways to speed up review times for your patches are: get this patch landed and ping it every couple of days. If it is not urgent, the common courtesy ping rate is one week. Remember that you're asking for valuable time from other professional developers. -* Ask for help on IRC. Developers on IRC will be able to either help you - directly, or tell you who might be a good reviewer. +* Ask for help on Discord. Developers on Discord will be able to either help + you directly, or tell you who might be a good reviewer. * Split your patch into multiple smaller patches that build on each other. The smaller your patch is, the higher the probability that somebody will take a quick look at it. When doing this, it is helpful to add "[N/M]" (for 1 <= N <= M) to diff --git a/llvm/docs/Contributing.rst b/llvm/docs/Contributing.rst index 17477d1c044d7..2f8ce7b7a1055 100644 --- a/llvm/docs/Contributing.rst +++ b/llvm/docs/Contributing.rst @@ -6,7 +6,7 @@ Contributing to LLVM Thank you for your interest in contributing to LLVM! There are multiple ways to contribute, and we appreciate all contributions. In case you have questions, you can either use the `Forum`_ or, for a more interactive chat, go to our -`Discord server`_ or the IRC #llvm channel on `irc.oftc.net`_. +`Discord server`_. If you want to contribute code, please familiarize yourself with the :doc:`DeveloperPolicy`. 
diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index 0ecf1423e6037..c50e100443cf3 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -201,7 +201,7 @@ within an area of a project: * aid release managers with backporting and other release-related activities, * be a point of contact for contributors who need help (answering questions - on Discord/Discourse/IRC or holding office hours). + on Discord/Discourse or holding office hours). Each top-level project in the monorepo will specify one or more lead maintainers who are responsible for ensuring community needs are diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index d43390bdad455..1f4f5d577efda 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -373,11 +373,7 @@ Guidance for office hours hosts * When starting an office hours session, consider typing something like "*Hi, I'm available for chats in the next half hour at* video chat URL. *I'm looking forward to having conversations on the video chat or here.*" on the - LLVM chat channels that you are already on. These could include: - - * the `#office-hours Discord channel - `__. - * :ref:`IRC` + the `#office-hours Discord channel `__. Doing this can help: * overcome potential anxiety to call in for a first time, @@ -388,27 +384,18 @@ Guidance for office hours hosts from the list above. -.. _IRC: - -IRC ---- +Discord +------- Users and developers of the LLVM project (including subprojects such as Clang) -can be found in #llvm on `irc.oftc.net `_. The channel -is actively moderated. +can be found on the community's `Discord `_ +chat server. The server is actively moderated. -The #llvm-build channel has a bot for +The #buildbot-status channel has a bot for `LLVM buildbot `_ status changes. 
The -bot will post a message with a link to a build bot and a blamelist when a build -goes from passing to failing and again (without the blamelist) when the build -goes from failing back to passing. It is a good channel for actively monitoring -build statuses, but it is a noisy channel due to the automated messages. The -channel is not actively moderated. - -In addition to the traditional IRC there is a -`Discord `_ -chat server available. To sign up, please use this -`invitation link `_. +bot will update the channel with a link to a build bot when a build goes from +passing to failing and again when the build goes from failing back to passing. +It is a great way to actively monitor the status of the build. .. _meetups-social-events: diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index ce4308022bf9f..d785d9da9a7f4 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -138,10 +138,16 @@ you won't encounter merge conflicts when landing the PR. collaborating with others on a single branch, be careful how and when you push changes. ``--force-with-lease`` may be useful in this situation. +Approvals +--------- + +Before merging a PR you must have the required approvals. See +:ref:`lgtm_how_a_patch_is_accepted` for more details. + Landing your change ------------------- -When your PR has been accepted you can merge your changes. +When your PR has been approved you can merge your changes. If you do not have write permissions for the repository, the merge button in GitHub's web interface will be disabled. If this is the case, continue following diff --git a/llvm/docs/ResponseGuide.rst b/llvm/docs/ResponseGuide.rst index c77e904cc9d0f..59111746f0ff9 100644 --- a/llvm/docs/ResponseGuide.rst +++ b/llvm/docs/ResponseGuide.rst @@ -189,7 +189,7 @@ taken, but below is a list of possible resolutions: violation. * A private verbal warning and/or reprimand from the committee to the individual(s) involved and request to stop this behavior. 
This conversation - may happen in person, email, by phone, video chat, or IRC. + may happen in person, email, by phone, video chat, or Discord. * Request that the reportee avoid any interaction with, and physical proximity to, another person for the remainder of the event. * Refusal of alcoholic beverage purchases by the reportee at LLVM events. @@ -202,8 +202,7 @@ taken, but below is a list of possible resolutions: * Immediately ending any volunteer responsibilities and privileges the reportee holds. * An imposed suspension (e.g., asking someone to "take a week off" from mailing - lists, bug tracker, IRC, Discord, repositories, or other communication - forms). + lists, bug tracker, Discord, repositories, or other communication forms). * A permanent or temporary ban from some or all LLVM Project spaces (online or in person). diff --git a/llvm/docs/_templates/indexsidebar.html b/llvm/docs/_templates/indexsidebar.html index 7e6d63f7e150a..4cfe08081fb37 100644 --- a/llvm/docs/_templates/indexsidebar.html +++ b/llvm/docs/_templates/indexsidebar.html @@ -15,7 +15,7 @@

Getting Involved

  • Contributing to LLVM
  • Submitting Bug Reports
  • Mailing Lists
  • -
  • IRC
  • +
  • Discord
  • Meetups and Social Events
  • diff --git a/llvm/include/llvm/ADT/Statistic.h b/llvm/include/llvm/ADT/Statistic.h index a64cfce3ba44a..aa7197cc6caef 100644 --- a/llvm/include/llvm/ADT/Statistic.h +++ b/llvm/include/llvm/ADT/Statistic.h @@ -177,8 +177,8 @@ void EnableStatistics(bool DoPrintOnExit = true); /// Check if statistics are enabled. bool AreStatisticsEnabled(); -/// Return a file stream to print our output on. -std::unique_ptr CreateInfoOutputFile(); +/// Return a stream to print our output on. +std::unique_ptr CreateInfoOutputFile(); /// Print statistics to the file returned by CreateInfoOutputFile(). void PrintStatistics(); diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 3e23e398f6a79..fd53a26ef8fc1 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1671,6 +1671,21 @@ TLI_DEFINE_ENUM_INTERNAL(htons) TLI_DEFINE_STRING_INTERNAL("htons") TLI_DEFINE_SIG_INTERNAL(Int16, Int16) +/// double hypot(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(hypot) +TLI_DEFINE_STRING_INTERNAL("hypot") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl, Dbl) + +/// float hypotf(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(hypotf) +TLI_DEFINE_STRING_INTERNAL("hypotf") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt, Flt) + +/// long double hypotl(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(hypotl) +TLI_DEFINE_STRING_INTERNAL("hypotl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, LDbl) + /// int iprintf(const char *format, ...); TLI_DEFINE_ENUM_INTERNAL(iprintf) TLI_DEFINE_STRING_INTERNAL("iprintf") diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h new file mode 100644 index 0000000000000..a2e1abb49a3eb --- /dev/null +++ b/llvm/include/llvm/CGData/StableFunctionMap.h @@ -0,0 +1,138 @@ +//===- StableFunctionMap.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This defines the StableFunctionMap class, to track similar functions. +// It provides a mechanism to map stable hashes of functions to their +// corresponding metadata. It includes structures for storing function details +// and methods for managing and querying these mappings. +// +//===---------------------------------------------------------------------===// + +#ifndef LLVM_CGDATA_STABLEFUNCTIONMAP_H +#define LLVM_CGDATA_STABLEFUNCTIONMAP_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/IR/StructuralHash.h" + +namespace llvm { + +using IndexPairHash = std::pair; +using IndexOperandHashVecType = SmallVector; + +/// A stable function is a function with a stable hash while tracking the +/// locations of ignored operands and their hashes. +struct StableFunction { + /// The combined stable hash of the function. + stable_hash Hash; + /// The name of the function. + std::string FunctionName; + /// The name of the module the function is in. + std::string ModuleName; + /// The number of instructions. + unsigned InstCount; + /// A vector of pairs of IndexPair and operand hash which was skipped. + IndexOperandHashVecType IndexOperandHashes; + + StableFunction(stable_hash Hash, const std::string FunctionName, + const std::string ModuleName, unsigned InstCount, + IndexOperandHashVecType &&IndexOperandHashes) + : Hash(Hash), FunctionName(FunctionName), ModuleName(ModuleName), + InstCount(InstCount), + IndexOperandHashes(std::move(IndexOperandHashes)) {} + StableFunction() = default; +}; + +struct StableFunctionMap { + /// An efficient form of StableFunction for fast look-up + struct StableFunctionEntry { + /// The combined stable hash of the function. + stable_hash Hash; + /// Id of the function name. 
+ unsigned FunctionNameId; + /// Id of the module name. + unsigned ModuleNameId; + /// The number of instructions. + unsigned InstCount; + /// A map from an IndexPair to a stable_hash which was skipped. + std::unique_ptr IndexOperandHashMap; + + StableFunctionEntry( + stable_hash Hash, unsigned FunctionNameId, unsigned ModuleNameId, + unsigned InstCount, + std::unique_ptr IndexOperandHashMap) + : Hash(Hash), FunctionNameId(FunctionNameId), + ModuleNameId(ModuleNameId), InstCount(InstCount), + IndexOperandHashMap(std::move(IndexOperandHashMap)) {} + }; + + using HashFuncsMapType = + DenseMap>>; + + /// Get the HashToFuncs map for serialization. + const HashFuncsMapType &getFunctionMap() const { return HashToFuncs; } + + /// Get the NameToId vector for serialization. + const SmallVector getNames() const { return IdToName; } + + /// Get an existing ID associated with the given name or create a new ID if it + /// doesn't exist. + unsigned getIdOrCreateForName(StringRef Name); + + /// Get the name associated with a given ID + std::optional getNameForId(unsigned Id) const; + + /// Insert a `StableFunction` object into the function map. This method + /// handles the uniquing of string names and create a `StableFunctionEntry` + /// for insertion. + void insert(const StableFunction &Func); + + /// Merge a \p OtherMap into this function map. + void merge(const StableFunctionMap &OtherMap); + + /// \returns true if there is no stable function entry. + bool empty() const { return size() == 0; } + + enum SizeType { + UniqueHashCount, // The number of unique hashes in HashToFuncs. + TotalFunctionCount, // The number of total functions in HashToFuncs. + MergeableFunctionCount, // The number of functions that can be merged based + // on their hash. + }; + + /// \returns the size of StableFunctionMap. + /// \p Type is the type of size to return. + size_t size(SizeType Type = UniqueHashCount) const; + + /// Finalize the stable function map by trimming content. 
+ void finalize(); + +private: + /// Insert a `StableFunctionEntry` into the function map directly. This + /// method assumes that string names have already been uniqued and the + /// `StableFunctionEntry` is ready for insertion. + void insert(std::unique_ptr FuncEntry) { + assert(!Finalized && "Cannot insert after finalization"); + HashToFuncs[FuncEntry->Hash].emplace_back(std::move(FuncEntry)); + } + + /// A map from a stable_hash to a vector of functions with that hash. + HashFuncsMapType HashToFuncs; + /// A vector of strings to hold names. + SmallVector IdToName; + /// A map from StringRef (name) to an ID. + StringMap NameToId; + /// True if the function map is finalized with minimal content. + bool Finalized = false; + + friend struct StableFunctionMapRecord; +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h new file mode 100644 index 0000000000000..0517f2c20d72f --- /dev/null +++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h @@ -0,0 +1,71 @@ +//===- StableFunctionMapRecord.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This defines the StableFunctionMapRecord structure, which provides +// functionality for managing and serializing a StableFunctionMap. It includes +// methods for serialization to and from raw and YAML streams, as well as +// utilities for merging and finalizing function maps. 
+// +//===---------------------------------------------------------------------===// + +#ifndef LLVM_CGDATA_STABLEFUNCTIONMAPRECORD_H +#define LLVM_CGDATA_STABLEFUNCTIONMAPRECORD_H + +#include "llvm/CGData/StableFunctionMap.h" +#include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +struct StableFunctionMapRecord { + std::unique_ptr FunctionMap; + + StableFunctionMapRecord() { + FunctionMap = std::make_unique(); + } + + StableFunctionMapRecord(std::unique_ptr FunctionMap) + : FunctionMap(std::move(FunctionMap)) {} + + /// A static helper function to serialize the stable function map without + /// owning the stable function map. + static void serialize(raw_ostream &OS, const StableFunctionMap *FunctionMap); + + /// Serialize the stable function map to a raw_ostream. + void serialize(raw_ostream &OS) const; + + /// Deserialize the stable function map from a raw_ostream. + void deserialize(const unsigned char *&Ptr); + + /// Serialize the stable function map to a YAML stream. + void serializeYAML(yaml::Output &YOS) const; + + /// Deserialize the stable function map from a YAML stream. + void deserializeYAML(yaml::Input &YIS); + + /// Finalize the stable function map by trimming content. + void finalize() { FunctionMap->finalize(); } + + /// Merge the stable function map into this one. + void merge(const StableFunctionMapRecord &Other) { + FunctionMap->merge(*Other.FunctionMap); + } + + /// \returns true if the stable function map is empty. + bool empty() const { return FunctionMap->empty(); } + + /// Print the stable function map in a YAML format. 
+ void print(raw_ostream &OS = llvm::errs()) const { + yaml::Output YOS(OS); + serializeYAML(YOS); + } +}; + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h index 1cb410a0c31c6..feb05de20b457 100644 --- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h +++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h @@ -20,10 +20,11 @@ class Value; /// Parameters (see the expansion example below): /// (the builder, %addr, %loaded, %new_val, ordering, -/// /* OUT */ %success, /* OUT */ %new_loaded) -using CreateCmpXchgInstFun = - function_ref; +/// /* OUT */ %success, /* OUT */ %new_loaded, +/// %MetadataSrc) +using CreateCmpXchgInstFun = function_ref; /// Expand an atomic RMW instruction into a loop utilizing /// cmpxchg. You'll want to make sure your target machine likes cmpxchg diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index b6309a9ea0ec7..cd7ebcf54c9e1 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -28,7 +28,7 @@ namespace llvm { class GenericMachineInstr : public MachineInstr { constexpr static unsigned PoisonFlags = NoUWrap | NoSWrap | NoUSWrap | IsExact | Disjoint | NonNeg | - FmNoNans | FmNoInfs; + FmNoNans | FmNoInfs | SameSign; public: GenericMachineInstr() = delete; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 6d71c150c8da6..343a0172ff39e 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -273,6 +273,11 @@ inline LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type) { LegalityPredicate typePairInSet(unsigned TypeIdx0, unsigned TypeIdx1, std::initializer_list> TypesInit); +/// True iff the given types for the given tuple of type indexes is 
one of the +/// specified type tuple. +LegalityPredicate +typeTupleInSet(unsigned TypeIdx0, unsigned TypeIdx1, unsigned Type2, + std::initializer_list> TypesInit); /// True iff the given types for the given pair of type indexes is one of the /// specified type pairs. LegalityPredicate typePairAndMemDescInSet( @@ -504,6 +509,15 @@ class LegalizeRuleSet { using namespace LegalityPredicates; return actionIf(Action, typePairInSet(typeIdx(0), typeIdx(1), Types)); } + + LegalizeRuleSet & + actionFor(LegalizeAction Action, + std::initializer_list> Types) { + using namespace LegalityPredicates; + return actionIf(Action, + typeTupleInSet(typeIdx(0), typeIdx(1), typeIdx(2), Types)); + } + /// Use the given action when type indexes 0 and 1 is any type pair in the /// given list. /// Action should be an action that requires mutation. @@ -615,6 +629,12 @@ class LegalizeRuleSet { return *this; return actionFor(LegalizeAction::Legal, Types); } + LegalizeRuleSet & + legalFor(bool Pred, std::initializer_list> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Legal, Types); + } /// The instruction is legal when type index 0 is any type in the given list /// and imm index 0 is anything. LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list Types) { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index c41e74ec7ebdc..a38dd34a17097 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1266,7 +1266,8 @@ class MachineIRBuilder { /// /// \return a MachineInstrBuilder for the newly created instruction. 
MachineInstrBuilder buildICmp(CmpInst::Predicate Pred, const DstOp &Res, - const SrcOp &Op0, const SrcOp &Op1); + const SrcOp &Op0, const SrcOp &Op1, + std::optional Flags = std::nullopt); /// Build and insert a \p Res = G_FCMP \p Pred\p Op0, \p Op1 /// diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 4016247376c4f..37653631cc238 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -180,7 +180,7 @@ std::optional getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI); /// \p VReg is defined by a G_CONSTANT, return the corresponding value. -APInt getIConstantFromReg(Register VReg, const MachineRegisterInfo &MRI); +const APInt &getIConstantFromReg(Register VReg, const MachineRegisterInfo &MRI); /// Simple struct used to hold a constant integer value and a virtual /// register. diff --git a/llvm/include/llvm/CodeGen/GlobalMerge.h b/llvm/include/llvm/CodeGen/GlobalMerge.h index 1577bcf8903f5..f1fb467fc7757 100644 --- a/llvm/include/llvm/CodeGen/GlobalMerge.h +++ b/llvm/include/llvm/CodeGen/GlobalMerge.h @@ -30,6 +30,9 @@ struct GlobalMergeOptions { bool MergeExternal = true; /// Whether we should merge constant global variables. bool MergeConstantGlobals = false; + /// Whether we should merge constant global variables aggressively without + /// looking at use. + bool MergeConstAggressive = false; /// Whether we should try to optimize for size only. /// Currently, this applies a dead simple heuristic: only consider globals /// used in minsize functions for merging. diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 3605173247463..ead6bbe1d5f64 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -119,6 +119,7 @@ class MachineInstr Disjoint = 1 << 19, // Each bit is zero in at least one of the inputs. 
NoUSWrap = 1 << 20, // Instruction supports geps // no unsigned signed wrap. + SameSign = 1 << 21 // Both operands have the same sign. }; private: diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index d1c71fc95818c..72054ab1d3c21 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -480,7 +480,8 @@ namespace llvm { Pass *createGlobalMergePass(const TargetMachine *TM, unsigned MaximalOffset, bool OnlyOptimizeForSize = false, bool MergeExternalByDefault = false, - bool MergeConstantByDefault = false); + bool MergeConstantByDefault = false, + bool MergeConstAggressiveByDefault = false); /// This pass splits the stack into a safe stack and an unsafe stack to /// protect against stack-based overflow vulnerabilities. diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index b3e249b7ebd5c..96667952a16ef 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -533,9 +533,7 @@ struct BinaryOpc_match { if (!Flags.has_value()) return true; - SDNodeFlags TmpFlags = *Flags; - TmpFlags.intersectWith(N->getFlags()); - return TmpFlags == *Flags; + return (*Flags & N->getFlags()) == *Flags; } return false; @@ -668,9 +666,7 @@ inline BinaryOpc_match m_Or(const LHS &L, const RHS &R) { template inline BinaryOpc_match m_DisjointOr(const LHS &L, const RHS &R) { - SDNodeFlags Flags; - Flags.setDisjoint(true); - return BinaryOpc_match(ISD::OR, L, R, Flags); + return BinaryOpc_match(ISD::OR, L, R, SDNodeFlags::Disjoint); } template @@ -813,9 +809,7 @@ template struct UnaryOpc_match { if (!Flags.has_value()) return true; - SDNodeFlags TmpFlags = *Flags; - TmpFlags.intersectWith(N->getFlags()); - return TmpFlags == *Flags; + return (*Flags & N->getFlags()) == *Flags; } return false; @@ -848,9 +842,7 @@ template inline UnaryOpc_match m_ZExt(const Opnd &Op) { template inline UnaryOpc_match m_NNegZExt(const Opnd &Op) { - 
SDNodeFlags Flags; - Flags.setNonNeg(true); - return UnaryOpc_match(ISD::ZERO_EXTEND, Op, Flags); + return UnaryOpc_match(ISD::ZERO_EXTEND, Op, SDNodeFlags::NonNeg); } template inline auto m_SExt(const Opnd &Op) { diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index e82bdb6906163..e03773f46ae09 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1064,17 +1064,13 @@ class SelectionDAG { /// addressing some offset of an object. i.e. if a load is split into multiple /// components, create an add nuw from the base pointer to the offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - return getMemBasePlusOffset(Ptr, Offset, SL, Flags); + return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - return getMemBasePlusOffset(Ptr, Offset, SL, Flags); + return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); } /// Return a new CALLSEQ_START node, that starts new call frame, in which @@ -1599,6 +1595,9 @@ class SelectionDAG { SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, SDValue Op2); + /// Expand the specified \c ISD::FSINCOS node as the Legalize pass would. + bool expandFSINCOS(SDNode *Node, SmallVectorImpl &Results); + /// Expand the specified \c ISD::VAARG node as the Legalize pass would. 
SDValue expandVAArg(SDNode *Node); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 26488413fe582..739ce05e94734 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -391,6 +391,7 @@ struct SDNodeFlags { None = 0, NoUnsignedWrap = 1 << 0, NoSignedWrap = 1 << 1, + NoWrap = NoUnsignedWrap | NoSignedWrap, Exact = 1 << 2, Disjoint = 1 << 3, NonNeg = 1 << 4, @@ -410,16 +411,18 @@ struct SDNodeFlags { NoFPExcept = 1 << 12, // Instructions with attached 'unpredictable' metadata on IR level. Unpredictable = 1 << 13, + // Compare instructions which may carry the samesign flag. + SameSign = 1 << 14, // NOTE: Please update LargestValue in LLVM_DECLARE_ENUM_AS_BITMASK below // the class definition when adding new flags. PoisonGeneratingFlags = NoUnsignedWrap | NoSignedWrap | Exact | Disjoint | - NonNeg | NoNaNs | NoInfs, + NonNeg | NoNaNs | NoInfs | SameSign, }; /// Default constructor turns off all optimization flags. - SDNodeFlags() : Flags(0) {} + SDNodeFlags(unsigned Flags = SDNodeFlags::None) : Flags(Flags) {} /// Propagate the fast-math-flags from an IR FPMathOperator. 
void copyFMF(const FPMathOperator &FPMO) { @@ -437,6 +440,7 @@ struct SDNodeFlags { void setNoSignedWrap(bool b) { setFlag(b); } void setExact(bool b) { setFlag(b); } void setDisjoint(bool b) { setFlag(b); } + void setSameSign(bool b) { setFlag(b); } void setNonNeg(bool b) { setFlag(b); } void setNoNaNs(bool b) { setFlag(b); } void setNoInfs(bool b) { setFlag(b); } @@ -453,6 +457,7 @@ struct SDNodeFlags { bool hasNoSignedWrap() const { return Flags & NoSignedWrap; } bool hasExact() const { return Flags & Exact; } bool hasDisjoint() const { return Flags & Disjoint; } + bool hasSameSign() const { return Flags & SameSign; } bool hasNonNeg() const { return Flags & NonNeg; } bool hasNoNaNs() const { return Flags & NoNaNs; } bool hasNoInfs() const { return Flags & NoInfs; } @@ -467,14 +472,22 @@ struct SDNodeFlags { bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; } - - /// Clear any flags in this flag set that aren't also set in Flags. All - /// flags will be cleared if Flags are undefined. - void intersectWith(const SDNodeFlags Flags) { this->Flags &= Flags.Flags; } + void operator&=(const SDNodeFlags &OtherFlags) { Flags &= OtherFlags.Flags; } + void operator|=(const SDNodeFlags &OtherFlags) { Flags |= OtherFlags.Flags; } }; LLVM_DECLARE_ENUM_AS_BITMASK(decltype(SDNodeFlags::None), - SDNodeFlags::Unpredictable); + SDNodeFlags::SameSign); + +inline SDNodeFlags operator|(SDNodeFlags LHS, SDNodeFlags RHS) { + LHS |= RHS; + return LHS; +} + +inline SDNodeFlags operator&(SDNodeFlags LHS, SDNodeFlags RHS) { + LHS &= RHS; + return LHS; +} /// Represents one node in the SelectionDAG. /// @@ -1013,6 +1026,7 @@ END_TWO_BYTE_PACK() SDNodeFlags getFlags() const { return Flags; } void setFlags(SDNodeFlags NewFlags) { Flags = NewFlags; } + void dropFlags(unsigned Mask) { Flags &= ~Mask; } /// Clear any flags in this node that aren't also set in Flags. /// If Flags is not in a defined state then this has no effect. 
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h index d00fe5c5535f5..24f303a7d9d13 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -34,9 +34,31 @@ class ShapeT { if (MRI) deduceImm(MRI); } + // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col) + // and ImmShapes. Due to the most case is only one shape (just simply use + // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector + // Shapes to keep the speed and code simplicity. + // TODO: The upper solution is a temporary way to minimize current tile + // register allocation code changes. It can not handle both Reg shape and + // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2 + // is imm shape). Refine me when we have more multi-tile shape instructions! + ShapeT(ArrayRef ShapesOperands, + const MachineRegisterInfo *MRI = nullptr) + : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), + ColImm(InvalidImmShape) { + assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!"); + + for (auto *Shape : ShapesOperands) + Shapes.push_back(Shape); + + if (MRI) + deduceImm(MRI); + } ShapeT() : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), ColImm(InvalidImmShape) {} + // TODO: We need to extern cmp operator for multi-shapes if + // we have requirement in the future. 
bool operator==(const ShapeT &Shape) const { MachineOperand *R = Shape.Row; MachineOperand *C = Shape.Col; @@ -53,13 +75,40 @@ class ShapeT { bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); } - MachineOperand *getRow() const { return Row; } + MachineOperand *getRow(unsigned I = 0) const { + if (Shapes.empty()) + return Row; + assert(Shapes.size() / 2 >= I && "Get invalid row from id!"); + return Shapes[I * 2]; + } - MachineOperand *getCol() const { return Col; } + MachineOperand *getCol(unsigned I = 0) const { + if (Shapes.empty()) + return Col; + assert(Shapes.size() / 2 >= I && "Get invalid col from id!"); + return Shapes[I * 2 + 1]; + } - int64_t getRowImm() const { return RowImm; } + int64_t getRowImm(unsigned I = 0) const { + if (ImmShapes.empty()) + return RowImm; + assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!"); + return ImmShapes[I * 2]; + } - int64_t getColImm() const { return ColImm; } + int64_t getColImm(unsigned I = 0) const { + if (ImmShapes.empty()) + return ColImm; + assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!"); + return ImmShapes[I * 2 + 1]; + } + + unsigned getShapeNum() { + if (Shapes.empty()) + return isValid() ? 
1 : 0; + else + return Shapes.size() / 2; + } bool isValid() { return (Row != nullptr) && (Col != nullptr); } @@ -72,14 +121,35 @@ class ShapeT { for (const MachineOperand &DefMO : MRI->def_operands(Reg)) { const auto *MI = DefMO.getParent(); if (MI->isMoveImmediate()) { - Imm = MI->getOperand(1).getImm(); + assert(MI->getNumOperands() == 2 && + "Unsupported number of operands in instruction for setting " + "row/column."); + if (MI->getOperand(1).isImm()) { + Imm = MI->getOperand(1).getImm(); + } else { + assert(MI->getOperand(1).isImplicit() && + "Operand 1 is assumed to be implicit."); + Imm = 0; + } break; } } return Imm; }; - RowImm = GetImm(Row->getReg()); - ColImm = GetImm(Col->getReg()); + if (Shapes.empty()) { // Single Shape + RowImm = GetImm(Row->getReg()); + ColImm = GetImm(Col->getReg()); + // The number of rows of 2nd destination buffer is assigned by the one of + // 1st destination buffer. If the column size is equal to zero, the row + // size should be reset to zero too. + if (ColImm == 0) + Row = Col; + } else { // Multiple Shapes + for (auto *Shape : Shapes) { + int64_t ImmShape = GetImm(Shape->getReg()); + ImmShapes.push_back(ImmShape); + } + } } private: @@ -88,6 +158,9 @@ class ShapeT { MachineOperand *Col; int64_t RowImm = -1; int64_t ColImm = -1; + // Multiple Shapes + SmallVector Shapes; + SmallVector ImmShapes; }; } // namespace llvm diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index 741dcc236b300..593f7a41337cb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -315,7 +315,7 @@ class StaticLibraryDefinitionGenerator : public DefinitionGenerator { /// Returns a list of filenames of dynamic libraries that this archive has /// imported. This class does not load these libraries by itself. User is - /// responsible for making sure these libraries are avaliable to the JITDylib. 
+ /// responsible for making sure these libraries are available to the JITDylib. const std::set &getImportedDynamicLibraries() const { return ImportedDynamicLibraries; } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h b/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h index ef42cc5f798fd..8a4740c1dd9cb 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTIONENGINE_ORC_JITLINKREDIRECABLEMANAGER_H -#define LLVM_EXECUTIONENGINE_ORC_JITLINKREDIRECABLEMANAGER_H +#ifndef LLVM_EXECUTIONENGINE_ORC_JITLINKREDIRECABLESYMBOLMANAGER_H +#define LLVM_EXECUTIONENGINE_ORC_JITLINKREDIRECABLESYMBOLMANAGER_H #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/RedirectionManager.h" @@ -103,4 +103,4 @@ class JITLinkRedirectableSymbolManager : public RedirectableSymbolManager, } // namespace orc } // namespace llvm -#endif +#endif // LLVM_EXECUTIONENGINE_ORC_JITLINKREDIRECABLESYMBOLMANAGER_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ReOptimizeLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/ReOptimizeLayer.h index 4adc3efad5573..cd185d54b2e7c 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ReOptimizeLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ReOptimizeLayer.h @@ -178,4 +178,4 @@ class ReOptimizeLayer : public IRLayer, public ResourceManager { } // namespace orc } // namespace llvm -#endif +#endif // LLVM_EXECUTIONENGINE_ORC_REOPTIMIZELAYER_H diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 92226a687cad4..143b538b361c9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2461,7 +2461,7 @@ def int_amdgcn_permlanex16 : // is a 32-bit 
constant whose high 8 bits must be zero which selects // the lanes to read from. def int_amdgcn_mov_dpp8 : - Intrinsic<[llvm_anyint_ty], + Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index dada426368995..e30d37f69f781 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -92,6 +92,4 @@ def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, L def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], []>; } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td index 4279661473d85..bf20080229aa4 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td @@ -10,13 +10,6 @@ // //===----------------------------------------------------------------------===// -class VCIXSuffix { - list suffix = !cond(!eq(range, "c"): ["e8mf8", "e8mf4", "e8mf2", "e8m1", "e8m2", "e8m4", "e8m8"], - !eq(range, "s"): ["e16mf4", "e16mf2", "e16m1", "e16m2", "e16m4", "e16m8"], - !eq(range, "i"): ["e32mf2", "e32m1", "e32m2", "e32m4", "e32m8"], - !eq(range, "l"): ["e64m1", "e64m2", "e64m4", "e64m8"]); -} - let TargetPrefix = "riscv" in { // Output: (vector_out) // Input: (bit<27-26>, bit<24-20>, scalar_in, vl) or diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 0ecca157077fd..c42397024e45a 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5917,6 +5917,41 @@ let TargetPrefix = "x86" 
in { [ImmArg>, ImmArg>, ImmArg>]>; + // AMX-FP8 + def int_x86_tdpbf8ps : ClangBuiltin<"__builtin_ia32_tdpbf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdpbhf8ps : ClangBuiltin<"__builtin_ia32_tdpbhf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdphbf8ps : ClangBuiltin<"__builtin_ia32_tdphbf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdphf8ps : ClangBuiltin<"__builtin_ia32_tdphf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + + // AMX-TRANSPOSE + def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], + [ImmArg>]>; + def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5994,6 +6029,27 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + + def int_x86_t2rpntlvwz0_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_t2rpntlvwz0t1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def 
int_x86_t2rpntlvwz1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_t2rpntlvwz1t1_internal : + Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; + def int_x86_ttransposed_internal : + ClangBuiltin<"__builtin_ia32_ttransposed_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index 902d1305c818a..7c405025630c9 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -135,16 +135,20 @@ template <> struct MappingTraits { } }; -struct FunctionSummaryYaml { +struct GlobalValueSummaryYaml { + // Commonly used fields unsigned Linkage, Visibility; bool NotEligibleToImport, Live, IsLocal, CanAutoHide; unsigned ImportType; - std::vector Refs; - std::vector TypeTests; - std::vector TypeTestAssumeVCalls, - TypeCheckedLoadVCalls; - std::vector TypeTestAssumeConstVCalls, - TypeCheckedLoadConstVCalls; + // Fields for AliasSummary + std::optional Aliasee; + // Fields for FunctionSummary + std::vector Refs = {}; + std::vector TypeTests = {}; + std::vector TypeTestAssumeVCalls = {}; + std::vector TypeCheckedLoadVCalls = {}; + std::vector TypeTestAssumeConstVCalls = {}; + std::vector TypeCheckedLoadConstVCalls = {}; }; } // End yaml namespace @@ -176,8 +180,8 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummary::ConstVCall) namespace llvm { namespace yaml { -template <> struct MappingTraits { - static void mapping(IO &io, FunctionSummaryYaml& summary) { +template <> struct MappingTraits { + static void mapping(IO &io, GlobalValueSummaryYaml &summary) { io.mapOptional("Linkage", summary.Linkage); io.mapOptional("Visibility", 
summary.Visibility); io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport); @@ -185,6 +189,7 @@ template <> struct MappingTraits { io.mapOptional("Local", summary.IsLocal); io.mapOptional("CanAutoHide", summary.CanAutoHide); io.mapOptional("ImportType", summary.ImportType); + io.mapOptional("Aliasee", summary.Aliasee); io.mapOptional("Refs", summary.Refs); io.mapOptional("TypeTests", summary.TypeTests); io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls); @@ -199,7 +204,7 @@ template <> struct MappingTraits { } // End yaml namespace } // End llvm namespace -LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummaryYaml) +LLVM_YAML_IS_SEQUENCE_VECTOR(GlobalValueSummaryYaml) namespace llvm { namespace yaml { @@ -207,61 +212,99 @@ namespace yaml { // FIXME: Add YAML mappings for the rest of the module summary. template <> struct CustomMappingTraits { static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) { - std::vector FSums; - io.mapRequired(Key.str().c_str(), FSums); + std::vector GVSums; + io.mapRequired(Key.str().c_str(), GVSums); uint64_t KeyInt; if (Key.getAsInteger(0, KeyInt)) { io.setError("key not an integer"); return; } auto &Elem = V.try_emplace(KeyInt, /*IsAnalysis=*/false).first->second; - for (auto &FSum : FSums) { + for (auto &GVSum : GVSums) { + GlobalValueSummary::GVFlags GVFlags( + static_cast(GVSum.Linkage), + static_cast(GVSum.Visibility), + GVSum.NotEligibleToImport, GVSum.Live, GVSum.IsLocal, + GVSum.CanAutoHide, + static_cast(GVSum.ImportType)); + if (GVSum.Aliasee) { + auto ASum = std::make_unique(GVFlags); + if (!V.count(*GVSum.Aliasee)) + V.emplace(*GVSum.Aliasee, /*IsAnalysis=*/false); + ValueInfo AliaseeVI(/*IsAnalysis=*/false, &*V.find(*GVSum.Aliasee)); + // Note: Aliasee cannot be filled until all summaries are loaded. + // This is done in fixAliaseeLinks() which is called in + // MappingTraits::mapping(). 
+ ASum->setAliasee(AliaseeVI, /*Aliasee=*/nullptr); + Elem.SummaryList.push_back(std::move(ASum)); + continue; + } SmallVector Refs; - Refs.reserve(FSum.Refs.size()); - for (auto &RefGUID : FSum.Refs) { + Refs.reserve(GVSum.Refs.size()); + for (auto &RefGUID : GVSum.Refs) { auto It = V.try_emplace(RefGUID, /*IsAnalysis=*/false).first; Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*It)); } Elem.SummaryList.push_back(std::make_unique( - GlobalValueSummary::GVFlags( - static_cast(FSum.Linkage), - static_cast(FSum.Visibility), - FSum.NotEligibleToImport, FSum.Live, FSum.IsLocal, - FSum.CanAutoHide, - static_cast(FSum.ImportType)), - /*NumInsts=*/0, FunctionSummary::FFlags{}, std::move(Refs), - SmallVector{}, std::move(FSum.TypeTests), - std::move(FSum.TypeTestAssumeVCalls), - std::move(FSum.TypeCheckedLoadVCalls), - std::move(FSum.TypeTestAssumeConstVCalls), - std::move(FSum.TypeCheckedLoadConstVCalls), + GVFlags, /*NumInsts=*/0, FunctionSummary::FFlags{}, std::move(Refs), + SmallVector{}, std::move(GVSum.TypeTests), + std::move(GVSum.TypeTestAssumeVCalls), + std::move(GVSum.TypeCheckedLoadVCalls), + std::move(GVSum.TypeTestAssumeConstVCalls), + std::move(GVSum.TypeCheckedLoadConstVCalls), ArrayRef{}, ArrayRef{}, ArrayRef{})); } } static void output(IO &io, GlobalValueSummaryMapTy &V) { for (auto &P : V) { - std::vector FSums; + std::vector GVSums; for (auto &Sum : P.second.SummaryList) { if (auto *FSum = dyn_cast(Sum.get())) { std::vector Refs; Refs.reserve(FSum->refs().size()); for (auto &VI : FSum->refs()) Refs.push_back(VI.getGUID()); - FSums.push_back(FunctionSummaryYaml{ + GVSums.push_back(GlobalValueSummaryYaml{ FSum->flags().Linkage, FSum->flags().Visibility, static_cast(FSum->flags().NotEligibleToImport), static_cast(FSum->flags().Live), static_cast(FSum->flags().DSOLocal), static_cast(FSum->flags().CanAutoHide), - FSum->flags().ImportType, Refs, FSum->type_tests(), - FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(), + 
FSum->flags().ImportType, /*Aliasee=*/std::nullopt, Refs, + FSum->type_tests(), FSum->type_test_assume_vcalls(), + FSum->type_checked_load_vcalls(), FSum->type_test_assume_const_vcalls(), FSum->type_checked_load_const_vcalls()}); - } + } else if (auto *ASum = dyn_cast(Sum.get()); + ASum && ASum->hasAliasee()) { + GVSums.push_back(GlobalValueSummaryYaml{ + ASum->flags().Linkage, ASum->flags().Visibility, + static_cast(ASum->flags().NotEligibleToImport), + static_cast(ASum->flags().Live), + static_cast(ASum->flags().DSOLocal), + static_cast(ASum->flags().CanAutoHide), + ASum->flags().ImportType, + /*Aliasee=*/ASum->getAliaseeGUID()}); + } + } + if (!GVSums.empty()) + io.mapRequired(llvm::utostr(P.first).c_str(), GVSums); + } + } + static void fixAliaseeLinks(GlobalValueSummaryMapTy &V) { + for (auto &P : V) { + for (auto &Sum : P.second.SummaryList) { + if (auto *Alias = dyn_cast(Sum.get())) { + ValueInfo AliaseeVI = Alias->getAliaseeVI(); + auto AliaseeSL = AliaseeVI.getSummaryList(); + if (AliaseeSL.empty()) { + ValueInfo EmptyVI; + Alias->setAliasee(EmptyVI, nullptr); + } else + Alias->setAliasee(AliaseeVI, AliaseeSL[0].get()); + } } - if (!FSums.empty()) - io.mapRequired(llvm::utostr(P.first).c_str(), FSums); } } }; @@ -281,6 +324,9 @@ template <> struct CustomMappingTraits { template <> struct MappingTraits { static void mapping(IO &io, ModuleSummaryIndex& index) { io.mapOptional("GlobalValueMap", index.GlobalValueMap); + if (!io.outputting()) + CustomMappingTraits::fixAliaseeLinks( + index.GlobalValueMap); io.mapOptional("TypeIdMap", index.TypeIdMap); io.mapOptional("WithGlobalValueDeadStripping", index.WithGlobalValueDeadStripping); diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h index 8d4a7e7ddce5f..61f8febe882fd 100644 --- a/llvm/include/llvm/Support/Alignment.h +++ b/llvm/include/llvm/Support/Alignment.h @@ -14,7 +14,7 @@ // power of two, its minimum value is 1 which means no alignment requirements. 
// // - MaybeAlign is an optional type, it may be undefined or set. When it's set -// you can get the underlying Align type by using the getValue() method. +// you can get the underlying Align type by using the value() method. // //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h new file mode 100644 index 0000000000000..93eae39e3d230 --- /dev/null +++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h @@ -0,0 +1,33 @@ +//===---------------- NVPTXAddrSpace.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// NVPTX address space definition +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_NVPTXADDRSPACE_H +#define LLVM_SUPPORT_NVPTXADDRSPACE_H + +namespace llvm { +namespace NVPTXAS { +enum AddressSpace : unsigned { + ADDRESS_SPACE_GENERIC = 0, + ADDRESS_SPACE_GLOBAL = 1, + ADDRESS_SPACE_SHARED = 3, + ADDRESS_SPACE_CONST = 4, + ADDRESS_SPACE_LOCAL = 5, + + ADDRESS_SPACE_PARAM = 101, +}; +} // end namespace NVPTXAS + +} // end namespace llvm + +#endif // LLVM_SUPPORT_NVPTXADDRSPACE_H diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h index 5ec8a718d5a3e..1e07fbe64f7d3 100644 --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -511,6 +511,7 @@ enum OperandEncoding { ENCODINGS ENCODING_max }; ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_TMM, "tile") \ + 
ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index d1d0c5ff87341..79c07bc2fc920 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -148,6 +148,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetMacroFusion.td b/llvm/include/llvm/Target/TargetMacroFusion.td index 0306794ef50d1..eafc0b08c1edf 100644 --- a/llvm/include/llvm/Target/TargetMacroFusion.td +++ b/llvm/include/llvm/Target/TargetMacroFusion.td @@ -14,7 +14,7 @@ def first_fusion_target : FusionTarget; def second_fusion_target : FusionTarget; def both_fusion_target : FusionTarget; -// Base class of FusionPredicate, etc. The avaliable variables are: +// Base class of FusionPredicate, etc. The available variables are: // * const TargetInstrInfo &TII // * const TargetSubtargetInfo &STI // * const MachineRegisterInfo &MRI diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 073e19f8187c6..a62b4df420ec6 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -264,6 +264,8 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) //FIXME: make MOVRS _COMPAT defined when gcc landed relate patch. X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") +X86_FEATURE (AMX_FP8, "amx-fp8") +X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h b/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h index eb682c437b94b..02adcd8bfd45d 100644 --- a/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h +++ b/llvm/include/llvm/Transforms/IPO/LowerTypeTests.h @@ -195,6 +195,13 @@ struct ByteArrayBuilder { bool isJumpTableCanonical(Function *F); +/// Specifies how to drop type tests. +enum class DropTestKind { + None, /// Do not drop type tests (default). + Assume, /// Drop only llvm.assumes using type test value. + All, /// Drop the type test and all uses. +}; + } // end namespace lowertypetests class LowerTypeTestsPass : public PassInfoMixin { @@ -202,13 +209,15 @@ class LowerTypeTestsPass : public PassInfoMixin { ModuleSummaryIndex *ExportSummary = nullptr; const ModuleSummaryIndex *ImportSummary = nullptr; - bool DropTypeTests = true; + lowertypetests::DropTestKind DropTypeTests = + lowertypetests::DropTestKind::None; public: LowerTypeTestsPass() : UseCommandLine(true) {} LowerTypeTestsPass(ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary, - bool DropTypeTests = false) + lowertypetests::DropTestKind DropTypeTests = + lowertypetests::DropTestKind::None) : ExportSummary(ExportSummary), ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index a4be24e32c527..1e8ef0102450e 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -175,6 +175,14 @@ void CloneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueMapTypeRemapper *TypeMapper = nullptr, ValueMaterializer *Materializer = nullptr); +/// Clone OldFunc's attributes into NewFunc, transforming values 
based on the +/// mappings in VMap. +void CloneFunctionAttributesInto(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst, ValueToValueMapTy &VMap, bool ModuleLevelChanges, diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d08be1e55c853..2cb2612bf611e 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1095,19 +1095,6 @@ static Value *simplifyDivRem(Instruction::BinaryOps Opcode, Value *Op0, if (match(Op1, m_Zero())) return PoisonValue::get(Ty); - // If any element of a constant divisor fixed width vector is zero or undef - // the behavior is undefined and we can fold the whole op to poison. - auto *Op1C = dyn_cast(Op1); - auto *VTy = dyn_cast(Ty); - if (Op1C && VTy) { - unsigned NumElts = VTy->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = Op1C->getAggregateElement(i); - if (Elt && (Elt->isNullValue() || Q.isUndefValue(Elt))) - return PoisonValue::get(Ty); - } - } - // poison / X -> poison // poison % X -> poison if (isa(Op0)) diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index caed62679a683..8e233a5c8d7a5 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -162,6 +162,9 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock, /// Collect all blocks from \p CurLoop which lie on all possible paths from /// the header of \p CurLoop (inclusive) to BB (exclusive) into the set /// \p Predecessors. If \p BB is the header, \p Predecessors will be empty. 
+/// Note: It's possible that we encounter Irreducible control flow, due to +/// which, we may find that a few predecessors of \p BB are not a part of the +/// \p CurLoop. We only return Predecessors that are a part of \p CurLoop. static void collectTransitivePredecessors( const Loop *CurLoop, const BasicBlock *BB, SmallPtrSetImpl &Predecessors) { @@ -171,6 +174,8 @@ static void collectTransitivePredecessors( return; SmallVector WorkList; for (const auto *Pred : predecessors(BB)) { + if (!CurLoop->contains(Pred)) + continue; Predecessors.insert(Pred); WorkList.push_back(Pred); } @@ -187,7 +192,7 @@ static void collectTransitivePredecessors( // We can ignore backedge of all loops containing BB to get a sligtly more // optimistic result. for (const auto *PredPred : predecessors(Pred)) - if (Predecessors.insert(PredPred).second) + if (CurLoop->contains(PredPred) && Predecessors.insert(PredPred).second) WorkList.push_back(PredPred); } } diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 0ee83d217a500..7f0b98ab3c151 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -300,6 +300,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_expf); TLI.setUnavailable(LibFunc_floorf); TLI.setUnavailable(LibFunc_fmodf); + TLI.setUnavailable(LibFunc_hypotf); TLI.setUnavailable(LibFunc_log10f); TLI.setUnavailable(LibFunc_logf); TLI.setUnavailable(LibFunc_modff); @@ -331,6 +332,7 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_floorl); TLI.setUnavailable(LibFunc_fmodl); TLI.setUnavailable(LibFunc_frexpl); + TLI.setUnavailable(LibFunc_hypotl); TLI.setUnavailable(LibFunc_ldexpl); TLI.setUnavailable(LibFunc_log10l); TLI.setUnavailable(LibFunc_logl); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index aa5142f336240..dfa3ecd191fac 100644 --- 
a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6003,14 +6003,32 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, if (IncValue == P) continue; + Instruction *CxtI = P->getIncomingBlock(U)->getTerminator(); + + // If the Use is a select of this phi, use the fp class of the other + // operand to break the recursion. Same around 2-operand phi nodes + Value *V; + if (match(IncValue, m_Select(m_Value(), m_Specific(P), m_Value(V))) || + match(IncValue, m_Select(m_Value(), m_Value(V), m_Specific(P)))) { + IncValue = V; + } else if (auto *IncPhi = dyn_cast(IncValue); + IncPhi && IncPhi->getNumIncomingValues() == 2) { + for (int Idx = 0; Idx < 2; ++Idx) { + if (IncPhi->getIncomingValue(Idx) == P) { + IncValue = IncPhi->getIncomingValue(1 - Idx); + CxtI = IncPhi->getIncomingBlock(1 - Idx)->getTerminator(); + break; + } + } + } + KnownFPClass KnownSrc; // Recurse, but cap the recursion to two levels, because we don't want // to waste time spinning around in loops. We need at least depth 2 to // detect known sign bits. 
computeKnownFPClass(IncValue, DemandedElts, InterestedClasses, KnownSrc, PhiRecursionLimit, - Q.getWithoutCondContext().getWithInstruction( - P->getIncomingBlock(U)->getTerminator())); + Q.getWithoutCondContext().getWithInstruction(CxtI)); if (First) { Known = KnownSrc; diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt index 157b0dfb7f9fc..003173139f36c 100644 --- a/llvm/lib/CGData/CMakeLists.txt +++ b/llvm/lib/CGData/CMakeLists.txt @@ -4,6 +4,8 @@ add_llvm_component_library(LLVMCGData CodeGenDataWriter.cpp OutlinedHashTree.cpp OutlinedHashTreeRecord.cpp + StableFunctionMap.cpp + StableFunctionMapRecord.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CGData diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp new file mode 100644 index 0000000000000..cfef5b22fe784 --- /dev/null +++ b/llvm/lib/CGData/StableFunctionMap.cpp @@ -0,0 +1,167 @@ +//===-- StableFunctionMap.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements the functionality for the StableFunctionMap class, which +// manages the mapping of stable function hashes to their metadata. It includes +// methods for inserting, merging, and finalizing function entries, as well as +// utilities for handling function names and IDs. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CGData/StableFunctionMap.h" + +#define DEBUG_TYPE "stable-function-map" + +using namespace llvm; + +unsigned StableFunctionMap::getIdOrCreateForName(StringRef Name) { + auto It = NameToId.find(Name); + if (It != NameToId.end()) + return It->second; + unsigned Id = IdToName.size(); + assert(Id == NameToId.size() && "ID collision"); + IdToName.emplace_back(Name.str()); + NameToId[IdToName.back()] = Id; + return Id; +} + +std::optional StableFunctionMap::getNameForId(unsigned Id) const { + if (Id >= IdToName.size()) + return std::nullopt; + return IdToName[Id]; +} + +void StableFunctionMap::insert(const StableFunction &Func) { + assert(!Finalized && "Cannot insert after finalization"); + auto FuncNameId = getIdOrCreateForName(Func.FunctionName); + auto ModuleNameId = getIdOrCreateForName(Func.ModuleName); + auto IndexOperandHashMap = std::make_unique(); + for (auto &[Index, Hash] : Func.IndexOperandHashes) + (*IndexOperandHashMap)[Index] = Hash; + auto FuncEntry = std::make_unique( + Func.Hash, FuncNameId, ModuleNameId, Func.InstCount, + std::move(IndexOperandHashMap)); + insert(std::move(FuncEntry)); +} + +void StableFunctionMap::merge(const StableFunctionMap &OtherMap) { + assert(!Finalized && "Cannot merge after finalization"); + for (auto &[Hash, Funcs] : OtherMap.HashToFuncs) { + auto &ThisFuncs = HashToFuncs[Hash]; + for (auto &Func : Funcs) { + auto FuncNameId = + getIdOrCreateForName(*OtherMap.getNameForId(Func->FunctionNameId)); + auto ModuleNameId = + getIdOrCreateForName(*OtherMap.getNameForId(Func->ModuleNameId)); + auto ClonedIndexOperandHashMap = + std::make_unique(*Func->IndexOperandHashMap); + ThisFuncs.emplace_back(std::make_unique( + Func->Hash, FuncNameId, ModuleNameId, Func->InstCount, + std::move(ClonedIndexOperandHashMap))); + } + } +} + +size_t StableFunctionMap::size(SizeType Type) const { + switch (Type) { + case UniqueHashCount: + return 
HashToFuncs.size(); + case TotalFunctionCount: { + size_t Count = 0; + for (auto &Funcs : HashToFuncs) + Count += Funcs.second.size(); + return Count; + } + case MergeableFunctionCount: { + size_t Count = 0; + for (auto &[Hash, Funcs] : HashToFuncs) + if (Funcs.size() >= 2) + Count += Funcs.size(); + return Count; + } + } + llvm_unreachable("Unhandled size type"); +} + +using ParamLocs = SmallVector; +static void removeIdenticalIndexPair( + SmallVector> &SFS) { + auto &RSF = SFS[0]; + unsigned StableFunctionCount = SFS.size(); + + SmallVector ToDelete; + for (auto &[Pair, Hash] : *(RSF->IndexOperandHashMap)) { + bool Identical = true; + for (unsigned J = 1; J < StableFunctionCount; ++J) { + auto &SF = SFS[J]; + const auto &SHash = SF->IndexOperandHashMap->at(Pair); + if (Hash != SHash) { + Identical = false; + break; + } + } + + // No need to parameterize them if the hashes are identical across stable + // functions. + if (Identical) + ToDelete.emplace_back(Pair); + } + + for (auto &Pair : ToDelete) + for (auto &SF : SFS) + SF->IndexOperandHashMap->erase(Pair); +} + +void StableFunctionMap::finalize() { + for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It) { + auto &[StableHash, SFS] = *It; + + // Group stable functions by ModuleIdentifier. + std::stable_sort(SFS.begin(), SFS.end(), + [&](const std::unique_ptr &L, + const std::unique_ptr &R) { + return *getNameForId(L->ModuleNameId) < + *getNameForId(R->ModuleNameId); + }); + + // Consider the first function as the root function. 
+ auto &RSF = SFS[0]; + + bool Invalid = false; + unsigned StableFunctionCount = SFS.size(); + for (unsigned I = 1; I < StableFunctionCount; ++I) { + auto &SF = SFS[I]; + assert(RSF->Hash == SF->Hash); + if (RSF->InstCount != SF->InstCount) { + Invalid = true; + break; + } + if (RSF->IndexOperandHashMap->size() != SF->IndexOperandHashMap->size()) { + Invalid = true; + break; + } + for (auto &P : *RSF->IndexOperandHashMap) { + auto &InstOpndIndex = P.first; + if (!SF->IndexOperandHashMap->count(InstOpndIndex)) { + Invalid = true; + break; + } + } + } + if (Invalid) { + HashToFuncs.erase(It); + continue; + } + + // Trim the index pair that has the same operand hash across + // stable functions. + removeIdenticalIndexPair(SFS); + } + + Finalized = true; +} diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp new file mode 100644 index 0000000000000..8eb667a651ebe --- /dev/null +++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp @@ -0,0 +1,202 @@ +//===-- StableFunctionMapRecord.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This implements the functionality for the StableFunctionMapRecord class, +// including methods for serialization and deserialization of stable function +// maps to and from raw and YAML streams. It also includes utilities for +// managing function entries and their metadata. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CGData/StableFunctionMapRecord.h" +#include "llvm/Support/EndianStream.h" + +#define DEBUG_TYPE "stable-function-map-record" + +using namespace llvm; +using namespace llvm::support; + +LLVM_YAML_IS_SEQUENCE_VECTOR(IndexPairHash) +LLVM_YAML_IS_SEQUENCE_VECTOR(StableFunction) + +namespace llvm { +namespace yaml { + +template <> struct MappingTraits { + static void mapping(IO &IO, IndexPairHash &Key) { + IO.mapRequired("InstIndex", Key.first.first); + IO.mapRequired("OpndIndex", Key.first.second); + IO.mapRequired("OpndHash", Key.second); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &IO, StableFunction &Func) { + IO.mapRequired("Hash", Func.Hash); + IO.mapRequired("FunctionName", Func.FunctionName); + IO.mapRequired("ModuleName", Func.ModuleName); + IO.mapRequired("InstCount", Func.InstCount); + IO.mapRequired("IndexOperandHashes", Func.IndexOperandHashes); + } +}; + +} // namespace yaml +} // namespace llvm + +// Get a sorted vector of StableFunctionEntry pointers. +static SmallVector +getStableFunctionEntries(const StableFunctionMap &SFM) { + SmallVector FuncEntries; + for (const auto &P : SFM.getFunctionMap()) + for (auto &Func : P.second) + FuncEntries.emplace_back(Func.get()); + + std::stable_sort( + FuncEntries.begin(), FuncEntries.end(), [&](auto &A, auto &B) { + return std::tuple(A->Hash, SFM.getNameForId(A->ModuleNameId), + SFM.getNameForId(A->FunctionNameId)) < + std::tuple(B->Hash, SFM.getNameForId(B->ModuleNameId), + SFM.getNameForId(B->FunctionNameId)); + }); + return FuncEntries; +} + +// Get a sorted vector of IndexOperandHashes. 
+static IndexOperandHashVecType getStableIndexOperandHashes( + const StableFunctionMap::StableFunctionEntry *FuncEntry) { + IndexOperandHashVecType IndexOperandHashes; + for (auto &[Indices, OpndHash] : *FuncEntry->IndexOperandHashMap) + IndexOperandHashes.emplace_back(Indices, OpndHash); + // The indices are unique, so we can just sort by the first. + llvm::sort(IndexOperandHashes); + return IndexOperandHashes; +} + +void StableFunctionMapRecord::serialize(raw_ostream &OS) const { + serialize(OS, FunctionMap.get()); +} + +void StableFunctionMapRecord::serialize(raw_ostream &OS, + const StableFunctionMap *FunctionMap) { + support::endian::Writer Writer(OS, endianness::little); + + // Write Names. + auto &Names = FunctionMap->getNames(); + uint32_t ByteSize = 4; + Writer.write(Names.size()); + for (auto &Name : Names) { + Writer.OS << Name << '\0'; + ByteSize += Name.size() + 1; + } + // Align ByteSize to 4 bytes. + uint32_t Padding = offsetToAlignment(ByteSize, Align(4)); + for (uint32_t I = 0; I < Padding; ++I) + Writer.OS << '\0'; + + // Write StableFunctionEntries whose pointers are sorted. + auto FuncEntries = getStableFunctionEntries(*FunctionMap); + Writer.write(FuncEntries.size()); + + for (const auto *FuncRef : FuncEntries) { + Writer.write(FuncRef->Hash); + Writer.write(FuncRef->FunctionNameId); + Writer.write(FuncRef->ModuleNameId); + Writer.write(FuncRef->InstCount); + + // Emit IndexOperandHashes sorted from IndexOperandHashMap. + IndexOperandHashVecType IndexOperandHashes = + getStableIndexOperandHashes(FuncRef); + Writer.write(IndexOperandHashes.size()); + for (auto &IndexOperandHash : IndexOperandHashes) { + Writer.write(IndexOperandHash.first.first); + Writer.write(IndexOperandHash.first.second); + Writer.write(IndexOperandHash.second); + } + } +} + +void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) { + // Assert that Ptr is 4-byte aligned + assert(((uintptr_t)Ptr % 4) == 0); + // Read Names. 
+ auto NumNames = + endian::readNext(Ptr); + // Early exit if there is no name. + if (NumNames == 0) + return; + for (unsigned I = 0; I < NumNames; ++I) { + StringRef Name(reinterpret_cast(Ptr)); + Ptr += Name.size() + 1; + FunctionMap->getIdOrCreateForName(Name); + } + // Align Ptr to 4 bytes. + Ptr = reinterpret_cast(alignAddr(Ptr, Align(4))); + + // Read StableFunctionEntries. + auto NumFuncs = + endian::readNext(Ptr); + for (unsigned I = 0; I < NumFuncs; ++I) { + auto Hash = + endian::readNext(Ptr); + auto FunctionNameId = + endian::readNext(Ptr); + assert(FunctionMap->getNameForId(FunctionNameId) && + "FunctionNameId out of range"); + auto ModuleNameId = + endian::readNext(Ptr); + assert(FunctionMap->getNameForId(ModuleNameId) && + "ModuleNameId out of range"); + auto InstCount = + endian::readNext(Ptr); + + // Read IndexOperandHashes to build IndexOperandHashMap + auto NumIndexOperandHashes = + endian::readNext(Ptr); + auto IndexOperandHashMap = std::make_unique(); + for (unsigned J = 0; J < NumIndexOperandHashes; ++J) { + auto InstIndex = + endian::readNext(Ptr); + auto OpndIndex = + endian::readNext(Ptr); + auto OpndHash = + endian::readNext(Ptr); + assert(InstIndex < InstCount && "InstIndex out of range"); + + IndexOperandHashMap->try_emplace({InstIndex, OpndIndex}, OpndHash); + } + + // Insert a new StableFunctionEntry into the map. 
+ auto FuncEntry = std::make_unique( + Hash, FunctionNameId, ModuleNameId, InstCount, + std::move(IndexOperandHashMap)); + + FunctionMap->insert(std::move(FuncEntry)); + } +} + +void StableFunctionMapRecord::serializeYAML(yaml::Output &YOS) const { + auto FuncEntries = getStableFunctionEntries(*FunctionMap); + SmallVector Functions; + for (const auto *FuncEntry : FuncEntries) { + auto IndexOperandHashes = getStableIndexOperandHashes(FuncEntry); + Functions.emplace_back( + FuncEntry->Hash, *FunctionMap->getNameForId(FuncEntry->FunctionNameId), + *FunctionMap->getNameForId(FuncEntry->ModuleNameId), + FuncEntry->InstCount, std::move(IndexOperandHashes)); + } + + YOS << Functions; +} + +void StableFunctionMapRecord::deserializeYAML(yaml::Input &YIS) { + std::vector Funcs; + YIS >> Funcs; + for (auto &Func : Funcs) + FunctionMap->insert(Func); + YIS.nextDocument(); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 4ea71c9bd4ad4..459ad15163ae5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1774,6 +1774,12 @@ void AsmPrinter::emitFunctionBody() { bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch"); bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + // Create a slot for the entry basic block section so that the section + // order is preserved when iterating over MBBSectionRanges. + if (!MF->empty()) + MBBSectionRanges[MF->front().getSectionID()] = + MBBSectionRange{CurrentFnBegin, nullptr}; + for (auto &MBB : *MF) { // Print a label for the basic block. 
emitBasicBlockStart(MBB); @@ -2052,11 +2058,8 @@ void AsmPrinter::emitFunctionBody() { } for (auto &Handler : Handlers) Handler->markFunctionEnd(); - - assert(!MBBSectionRanges.contains(MF->front().getSectionID()) && - "Overwrite section range"); - MBBSectionRanges[MF->front().getSectionID()] = - MBBSectionRange{CurrentFnBegin, CurrentFnEnd}; + // Update the end label of the entry block's section. + MBBSectionRanges[MF->front().getSectionID()].EndLabel = CurrentFnEnd; // Print out jump tables referenced by the function. emitJumpTableInfo(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 10736305762d2..98f8f67383a37 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -34,6 +34,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" @@ -1776,18 +1777,14 @@ bool DwarfDebug::buildLocationList(SmallVectorImpl &DebugLoc, // span each individual section in the range from StartLabel to EndLabel. 
if (Asm->MF->hasBBSections() && StartLabel == Asm->getFunctionBegin() && !Instr->getParent()->sameSection(&Asm->MF->front())) { - const MCSymbol *BeginSectionLabel = StartLabel; - - for (const MachineBasicBlock &MBB : *Asm->MF) { - if (MBB.isBeginSection() && &MBB != &Asm->MF->front()) - BeginSectionLabel = MBB.getSymbol(); - - if (MBB.sameSection(Instr->getParent())) { - DebugLoc.emplace_back(BeginSectionLabel, EndLabel, Values); + for (const auto &[MBBSectionId, MBBSectionRange] : + Asm->MBBSectionRanges) { + if (Instr->getParent()->getSectionID() == MBBSectionId) { + DebugLoc.emplace_back(MBBSectionRange.BeginLabel, EndLabel, Values); break; } - if (MBB.isEndSection()) - DebugLoc.emplace_back(BeginSectionLabel, MBB.getEndSymbol(), Values); + DebugLoc.emplace_back(MBBSectionRange.BeginLabel, + MBBSectionRange.EndLabel, Values); } } else { DebugLoc.emplace_back(StartLabel, EndLabel, Values); @@ -1828,22 +1825,27 @@ bool DwarfDebug::buildLocationList(SmallVectorImpl &DebugLoc, RangeMBB = &Asm->MF->front(); else RangeMBB = Entries.begin()->getInstr()->getParent(); + auto RangeIt = Asm->MBBSectionRanges.find(RangeMBB->getSectionID()); + assert(RangeIt != Asm->MBBSectionRanges.end() && + "Range MBB not found in MBBSectionRanges!"); auto *CurEntry = DebugLoc.begin(); auto *NextEntry = std::next(CurEntry); + auto NextRangeIt = std::next(RangeIt); while (NextEntry != DebugLoc.end()) { - // Get the last machine basic block of this section. - while (!RangeMBB->isEndSection()) - RangeMBB = RangeMBB->getNextNode(); - if (!RangeMBB->getNextNode()) + if (NextRangeIt == Asm->MBBSectionRanges.end()) return false; // CurEntry should end the current section and NextEntry should start // the next section and the Values must match for these two ranges to be - // merged. - if (CurEntry->getEndSym() != RangeMBB->getEndSymbol() || - NextEntry->getBeginSym() != RangeMBB->getNextNode()->getSymbol() || + // merged. Do not match the section label end if it is the entry block + // section. 
This is because the end label for the Debug Loc and the + // Function end label could be different. + if ((RangeIt->second.EndLabel != Asm->getFunctionEnd() && + CurEntry->getEndSym() != RangeIt->second.EndLabel) || + NextEntry->getBeginSym() != NextRangeIt->second.BeginLabel || CurEntry->getValues() != NextEntry->getValues()) return false; - RangeMBB = RangeMBB->getNextNode(); + RangeIt = NextRangeIt; + NextRangeIt = std::next(RangeIt); CurEntry = NextEntry; NextEntry = std::next(CurEntry); } diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 3d4e2cb196a16..0aff4f1f5cf1c 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -98,7 +98,7 @@ class AtomicExpandImpl { IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg); + CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc); bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); @@ -194,6 +194,39 @@ static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) { return DL.getTypeStoreSize(CASI->getCompareOperand()->getType()); } +/// Copy metadata that's safe to preserve when widening atomics. 
+static void copyMetadataForAtomic(Instruction &Dest, + const Instruction &Source) { + SmallVector, 8> MD; + Source.getAllMetadata(MD); + LLVMContext &Ctx = Dest.getContext(); + MDBuilder MDB(Ctx); + + for (auto [ID, N] : MD) { + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_noalias_addrspace: + case LLVMContext::MD_access_group: + case LLVMContext::MD_mmra: + Dest.setMetadata(ID, N); + break; + default: + if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory")) + Dest.setMetadata(ID, N); + else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory")) + Dest.setMetadata(ID, N); + + // Losing amdgpu.ignore.denormal.mode, but it doesn't matter for current + // uses. + break; + } + } +} + // Determine if a particular atomic operation has a supported size, // and is of appropriate alignment, to be passed through for target // lowering. (Versus turning into a __atomic libcall) @@ -600,7 +633,8 @@ void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) { static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr, Value *Loaded, Value *NewVal, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, - Value *&Success, Value *&NewLoaded) { + Value *&Success, Value *&NewLoaded, + Instruction *MetadataSrc) { Type *OrigTy = NewVal->getType(); // This code can go away when cmpxchg supports FP and vector types. 
@@ -612,9 +646,12 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr, Loaded = Builder.CreateBitCast(Loaded, IntTy); } - Value *Pair = Builder.CreateAtomicCmpXchg( + AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, AddrAlign, MemOpOrder, AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID); + if (MetadataSrc) + copyMetadataForAtomic(*Pair, *MetadataSrc); + Success = Builder.CreateExtractValue(Pair, 1, "success"); NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); @@ -951,9 +988,9 @@ void AtomicExpandImpl::expandPartwordAtomicRMW( Value *OldResult; if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) { - OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, - PMV.AlignedAddrAlignment, MemOpOrder, SSID, - PerformPartwordOp, createCmpXchgInstFun); + OldResult = insertRMWCmpXchgLoop( + Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment, + MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI); } else { assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC); OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr, @@ -966,36 +1003,6 @@ void AtomicExpandImpl::expandPartwordAtomicRMW( AI->eraseFromParent(); } -/// Copy metadata that's safe to preserve when widening atomics. 
-static void copyMetadataForAtomic(Instruction &Dest, - const Instruction &Source) { - SmallVector, 8> MD; - Source.getAllMetadata(MD); - LLVMContext &Ctx = Dest.getContext(); - MDBuilder MDB(Ctx); - - for (auto [ID, N] : MD) { - switch (ID) { - case LLVMContext::MD_dbg: - case LLVMContext::MD_tbaa: - case LLVMContext::MD_tbaa_struct: - case LLVMContext::MD_alias_scope: - case LLVMContext::MD_noalias: - case LLVMContext::MD_access_group: - case LLVMContext::MD_mmra: - Dest.setMetadata(ID, N); - break; - default: - if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory")) - Dest.setMetadata(ID, N); - else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory")) - Dest.setMetadata(ID, N); - - break; - } - } -} - // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width. AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) { ReplacementIRBuilder Builder(AI, *DL); @@ -1591,7 +1598,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop( IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, - CreateCmpXchgInstFun CreateCmpXchg) { + CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) { LLVMContext &Ctx = Builder.getContext(); BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); @@ -1637,7 +1644,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop( MemOpOrder == AtomicOrdering::Unordered ? 
AtomicOrdering::Monotonic : MemOpOrder, - SSID, Success, NewLoaded); + SSID, Success, NewLoaded, MetadataSrc); assert(Success && NewLoaded); Loaded->addIncoming(NewLoaded, LoopBB); @@ -1686,7 +1693,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded, AI->getValOperand()); }, - CreateCmpXchg); + CreateCmpXchg, /*MetadataSrc=*/AI); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -1838,11 +1845,15 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) { expandAtomicRMWToCmpXchg( I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded, Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder, - SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) { + SyncScope::ID SSID, Value *&Success, Value *&NewLoaded, + Instruction *MetadataSrc) { // Create the CAS instruction normally... AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( Addr, Loaded, NewVal, Alignment, MemOpOrder, AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID); + if (MetadataSrc) + copyMetadataForAtomic(*Pair, *MetadataSrc); + Success = Builder.CreateExtractValue(Pair, 1, "success"); NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 5224a6c8d1a37..f1ac3d95a8dd8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1981,17 +1981,36 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI, return true; } -static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, - const LoopInfo *LI, - Value *&RemAmtOut, - PHINode *&LoopIncrPNOut) { +static bool isRemOfLoopIncrementWithLoopInvariant( + Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut, + Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) { Value *Incr, *RemAmt; // NB: If RemAmt is a power of 2 it *should* have been transformed by now. 
if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt)))) return false; + Value *AddInst, *AddOffset; // Find out loop increment PHI. auto *PN = dyn_cast(Incr); + if (PN != nullptr) { + AddInst = nullptr; + AddOffset = nullptr; + } else { + // Search through a NUW add on top of the loop increment. + Value *V0, *V1; + if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1)))) + return false; + + AddInst = Incr; + PN = dyn_cast(V0); + if (PN != nullptr) { + AddOffset = V1; + } else { + PN = dyn_cast(V1); + AddOffset = V0; + } + } + if (!PN) return false; @@ -2031,6 +2050,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, // Set output variables. RemAmtOut = RemAmt; LoopIncrPNOut = PN; + AddInstOut = AddInst; + AddOffsetOut = AddOffset; return true; } @@ -2045,15 +2066,14 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem, // Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant; // for(i = Start; i < End; ++i, ++rem) // Rem = rem == RemAmtLoopInvariant ? 0 : Rem; -// -// Currently only implemented for `IncrLoopInvariant` being zero. static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, const LoopInfo *LI, SmallSet &FreshBBs, bool IsHuge) { - Value *RemAmt; + Value *AddOffset, *RemAmt, *AddInst; PHINode *LoopIncrPN; - if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN)) + if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddInst, + AddOffset, LoopIncrPN)) return false; // Only non-constant remainder as the extra IV is probably not profitable @@ -2071,6 +2091,23 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, Loop *L = LI->getLoopFor(LoopIncrPN->getParent()); Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()); + // If we have add create initial value for remainder. 
+ // The logic here is: + // (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant + // + // Only proceed if the expression simplifies (otherwise we can't fully + // optimize out the urem). + if (AddInst) { + assert(AddOffset && "We found an add but missing values"); + // Without dom-condition/assumption cache we aren't likely to get much out + // of a context instruction. + Start = simplifyAddInst(Start, AddOffset, + match(AddInst, m_NSWAdd(m_Value(), m_Value())), + /*IsNUW=*/true, *DL); + if (!Start) + return false; + } + // If we can't fully optimize out the `rem`, skip this transform. Start = simplifyURemInst(Start, RemAmt, *DL); if (!Start) @@ -2098,9 +2135,12 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, FreshBBs.insert(LoopIncrPN->getParent()); FreshBBs.insert(L->getLoopLatch()); FreshBBs.insert(Rem->getParent()); - + if (AddInst) + FreshBBs.insert(cast(AddInst)->getParent()); replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge); Rem->eraseFromParent(); + if (AddInst && AddInst->use_empty()) + cast(AddInst)->eraseFromParent(); return true; } diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b7ddf9f479ef8..7c1bda2163b7a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1110,6 +1110,9 @@ void CombinerHelper::applySextInRegOfLoad( Builder.buildLoadInstr(TargetOpcode::G_SEXTLOAD, MI.getOperand(0).getReg(), LoadDef->getPointerReg(), *NewMMO); MI.eraseFromParent(); + + // Not all loads can be deleted, so make sure the old one is removed. 
+ LoadDef->eraseFromParent(); } /// Return true if 'MI' is a load or a store that may be fold it's address diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 5381dce58f9e6..a87754389cc8e 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -340,20 +340,17 @@ bool IRTranslator::translateCompare(const User &U, Register Op1 = getOrCreateVReg(*U.getOperand(1)); Register Res = getOrCreateVReg(U); CmpInst::Predicate Pred = CI->getPredicate(); + uint32_t Flags = MachineInstr::copyFlagsFromInstruction(*CI); if (CmpInst::isIntPredicate(Pred)) - MIRBuilder.buildICmp(Pred, Res, Op0, Op1); + MIRBuilder.buildICmp(Pred, Res, Op0, Op1, Flags); else if (Pred == CmpInst::FCMP_FALSE) MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getNullValue(U.getType()))); else if (Pred == CmpInst::FCMP_TRUE) MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType()))); - else { - uint32_t Flags = 0; - if (CI) - Flags = MachineInstr::copyFlagsFromInstruction(*CI); + else MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, Flags); - } return true; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp index 8fe48195c610b..b7541effafe5c 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp @@ -49,6 +49,17 @@ LegalityPredicate LegalityPredicates::typePairInSet( }; } +LegalityPredicate LegalityPredicates::typeTupleInSet( + unsigned TypeIdx0, unsigned TypeIdx1, unsigned TypeIdx2, + std::initializer_list> TypesInit) { + SmallVector, 4> Types = TypesInit; + return [=](const LegalityQuery &Query) { + std::tuple Match = { + Query.Types[TypeIdx0], Query.Types[TypeIdx1], Query.Types[TypeIdx2]}; + return llvm::is_contained(Types, Match); + }; +} + LegalityPredicate LegalityPredicates::typePairAndMemDescInSet( unsigned TypeIdx0, unsigned 
TypeIdx1, unsigned MMOIdx, std::initializer_list TypesAndMemDescInit) { @@ -202,7 +213,7 @@ LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) { LegalityPredicate LegalityPredicates::numElementsNotPow2(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; - return QueryTy.isVector() && !isPowerOf2_32(QueryTy.getNumElements()); + return QueryTy.isFixedVector() && !isPowerOf2_32(QueryTy.getNumElements()); }; } diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 59f2fc633f5de..15b9164247846 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -898,8 +898,9 @@ MachineIRBuilder::buildFPTrunc(const DstOp &Res, const SrcOp &Op, MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred, const DstOp &Res, const SrcOp &Op0, - const SrcOp &Op1) { - return buildInstr(TargetOpcode::G_ICMP, Res, {Pred, Op0, Op1}); + const SrcOp &Op1, + std::optional Flags) { + return buildInstr(TargetOpcode::G_ICMP, Res, {Pred, Op0, Op1}, Flags); } MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred, diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index dcbbb0871a844..5cee07461d7e2 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -287,7 +287,8 @@ std::optional llvm::getIConstantVRegVal(Register VReg, return ValAndVReg->Value; } -APInt llvm::getIConstantFromReg(Register Reg, const MachineRegisterInfo &MRI) { +const APInt &llvm::getIConstantFromReg(Register Reg, + const MachineRegisterInfo &MRI) { MachineInstr *Const = MRI.getVRegDef(Reg); assert((Const && Const->getOpcode() == TargetOpcode::G_CONSTANT) && "expected a G_CONSTANT on Reg"); diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 4c6c8c600ee2b..9a939d06946df 100644 --- 
a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -202,12 +202,13 @@ class GlobalMerge : public FunctionPass { explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset, bool OnlyOptimizeForSize, bool MergeExternalGlobals, - bool MergeConstantGlobals) + bool MergeConstantGlobals, bool MergeConstAggressive) : FunctionPass(ID), TM(TM) { Opt.MaxOffset = MaximalOffset; Opt.SizeOnly = OnlyOptimizeForSize; Opt.MergeExternal = MergeExternalGlobals; Opt.MergeConstantGlobals = MergeConstantGlobals; + Opt.MergeConstAggressive = MergeConstAggressive; initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -268,7 +269,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, }); // If we want to just blindly group all globals together, do so. - if (!GlobalMergeGroupByUse || (GlobalMergeAllConst && isConst)) { + if (!GlobalMergeGroupByUse || (Opt.MergeConstAggressive && isConst)) { BitVector AllGlobals(Globals.size()); AllGlobals.set(); return doMerge(Globals, AllGlobals, M, isConst, AddrSpace); @@ -758,10 +759,14 @@ bool GlobalMergeImpl::run(Module &M) { Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset, bool OnlyOptimizeForSize, bool MergeExternalByDefault, - bool MergeConstantByDefault) { + bool MergeConstantByDefault, + bool MergeConstAggressiveByDefault) { bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ? MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE); bool MergeConstant = EnableGlobalMergeOnConst || MergeConstantByDefault; + bool MergeConstAggressive = GlobalMergeAllConst.getNumOccurrences() > 0 + ? 
GlobalMergeAllConst + : MergeConstAggressiveByDefault; return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal, - MergeConstant); + MergeConstant, MergeConstAggressive); } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 5a3806ce57335..1c450b05f49e9 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -216,6 +216,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("exact", MIToken::kw_exact) .Case("nneg", MIToken::kw_nneg) .Case("disjoint", MIToken::kw_disjoint) + .Case("samesign", MIToken::kw_samesign) .Case("nofpexcept", MIToken::kw_nofpexcept) .Case("unpredictable", MIToken::kw_unpredictable) .Case("debug-location", MIToken::kw_debug_location) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 3931da3eaae1d..d7cd06759cfbb 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -77,6 +77,7 @@ struct MIToken { kw_unpredictable, kw_nneg, kw_disjoint, + kw_samesign, kw_debug_location, kw_debug_instr_number, kw_dbg_instr_ref, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 45847b5830da6..059814c70f828 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1476,7 +1476,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Token.is(MIToken::kw_noconvergent) || Token.is(MIToken::kw_unpredictable) || Token.is(MIToken::kw_nneg) || - Token.is(MIToken::kw_disjoint)) { + Token.is(MIToken::kw_disjoint) || + Token.is(MIToken::kw_samesign)) { // clang-format on // Mine frame and fast math flags if (Token.is(MIToken::kw_frame_setup)) @@ -1513,6 +1514,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Flags |= MachineInstr::NonNeg; if (Token.is(MIToken::kw_disjoint)) Flags |= MachineInstr::Disjoint; + if 
(Token.is(MIToken::kw_samesign)) + Flags |= MachineInstr::SameSign; lex(); } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index a015cd3c2a55f..658bbe0e577e5 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -837,6 +837,8 @@ void MIPrinter::print(const MachineInstr &MI) { OS << "disjoint "; if (MI.getFlag(MachineInstr::NoUSWrap)) OS << "nusw "; + if (MI.getFlag(MachineInstr::SameSign)) + OS << "samesign "; OS << TII->getName(MI.getOpcode()); if (I < E) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index c1bd0bb5b7162..941861da5c569 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -596,6 +596,11 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::Disjoint; } + // Copy the samesign flag. + if (const ICmpInst *ICmp = dyn_cast(&I)) + if (ICmp->hasSameSign()) + MIFlags |= MachineInstr::MIFlag::SameSign; + // Copy the exact flag. if (const PossiblyExactOperator *PE = dyn_cast(&I)) if (PE->isExact()) @@ -1770,6 +1775,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "nneg "; if (getFlag(MachineInstr::Disjoint)) OS << "disjoint "; + if (getFlag(MachineInstr::SameSign)) + OS << "samesign "; // Print the opcode name. if (TII) diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index ad2037a2c20b5..f0667f7b348c7 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -898,7 +898,7 @@ class SafeStackLegacyPass : public FunctionPass { bool ShouldPreserveDominatorTree; std::optional LazilyComputedDomTree; - // Do we already have a DominatorTree avaliable from the previous pass? + // Do we already have a DominatorTree available from the previous pass? // Note that we should *NOT* require it, to avoid the case where we end up // not needing it, but the legacy PM would have computed it for us anyways. 
if (auto *DTWP = getAnalysisIfAvailable()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ceaf5d664131c..7eef09e55101d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -401,6 +401,8 @@ namespace { SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); + SDValue foldShiftToAvg(SDNode *N); + SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC); @@ -1210,7 +1212,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDNodeFlags NewFlags; if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) - NewFlags.setNoUnsignedWrap(true); + NewFlags |= SDNodeFlags::NoUnsignedWrap; if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) @@ -2892,11 +2894,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { if (N->getFlags().hasNoUnsignedWrap() && N0->getFlags().hasNoUnsignedWrap() && N0.getOperand(0)->getFlags().hasNoUnsignedWrap()) { - Flags.setNoUnsignedWrap(true); + Flags |= SDNodeFlags::NoUnsignedWrap; if (N->getFlags().hasNoSignedWrap() && N0->getFlags().hasNoSignedWrap() && N0.getOperand(0)->getFlags().hasNoSignedWrap()) - Flags.setNoSignedWrap(true); + Flags |= SDNodeFlags::NoSignedWrap; } SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N1), VT, A, DAG.getConstant(CM, DL, VT), Flags); @@ -2920,12 +2922,12 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { N0->getFlags().hasNoUnsignedWrap() && OMul->getFlags().hasNoUnsignedWrap() && OMul.getOperand(0)->getFlags().hasNoUnsignedWrap()) { - Flags.setNoUnsignedWrap(true); + Flags |= SDNodeFlags::NoUnsignedWrap; if (N->getFlags().hasNoSignedWrap() && N0->getFlags().hasNoSignedWrap() && OMul->getFlags().hasNoSignedWrap() && OMul.getOperand(0)->getFlags().hasNoSignedWrap()) - 
Flags.setNoSignedWrap(true); + Flags |= SDNodeFlags::NoSignedWrap; } SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N1), VT, A, DAG.getConstant(CM, DL, VT), Flags); @@ -2987,11 +2989,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && - DAG.haveNoCommonBitsSet(N0, N1)) { - SDNodeFlags Flags; - Flags.setDisjoint(true); - return DAG.getNode(ISD::OR, DL, VT, N0, N1, Flags); - } + DAG.haveNoCommonBitsSet(N0, N1)) + return DAG.getNode(ISD::OR, DL, VT, N0, N1, SDNodeFlags::Disjoint); // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) { @@ -5354,6 +5353,27 @@ SDValue DAGCombiner::visitAVG(SDNode *N) { DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getAllOnesConstant(DL, VT))); } + // Fold avgfloor((add nw x,y), 1) -> avgceil(x,y) + // Fold avgfloor((add nw x,1), y) -> avgceil(x,y) + if ((Opcode == ISD::AVGFLOORU && hasOperation(ISD::AVGCEILU, VT)) || + (Opcode == ISD::AVGFLOORS && hasOperation(ISD::AVGCEILS, VT))) { + SDValue Add; + if (sd_match(N, + m_c_BinOp(Opcode, + m_AllOf(m_Value(Add), m_Add(m_Value(X), m_Value(Y))), + m_One())) || + sd_match(N, m_c_BinOp(Opcode, + m_AllOf(m_Value(Add), m_Add(m_Value(X), m_One())), + m_Value(Y)))) { + + if (IsSigned && Add->getFlags().hasNoSignedWrap()) + return DAG.getNode(ISD::AVGCEILS, DL, VT, X, Y); + + if (!IsSigned && Add->getFlags().hasNoUnsignedWrap()) + return DAG.getNode(ISD::AVGCEILU, DL, VT, X, Y); + } + } + return SDValue(); } @@ -9556,11 +9576,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // fold (a^b) -> (a|b) iff a and b share no bits. 
if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && - DAG.haveNoCommonBitsSet(N0, N1)) { - SDNodeFlags Flags; - Flags.setDisjoint(true); - return DAG.getNode(ISD::OR, DL, VT, N0, N1, Flags); - } + DAG.haveNoCommonBitsSet(N0, N1)) + return DAG.getNode(ISD::OR, DL, VT, N0, N1, SDNodeFlags::Disjoint); // look for 'add-like' folds: // XOR(N0,MIN_SIGNED_VALUE) == ADD(N0,MIN_SIGNED_VALUE) @@ -10210,7 +10227,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { SDNodeFlags Flags; // Preserve the disjoint flag for Or. if (N0.getOpcode() == ISD::OR && N0->getFlags().hasDisjoint()) - Flags.setDisjoint(true); + Flags |= SDNodeFlags::Disjoint; return DAG.getNode(N0.getOpcode(), DL, VT, Shl0, Shl1, Flags); } } @@ -10635,6 +10652,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue NarrowLoad = reduceLoadWidth(N)) return NarrowLoad; + if (SDValue AVG = foldShiftToAvg(N)) + return AVG; + return SDValue(); } @@ -10889,6 +10909,9 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (SDValue MULH = combineShiftToMULH(N, DL, DAG, TLI)) return MULH; + if (SDValue AVG = foldShiftToAvg(N)) + return AVG; + return SDValue(); } @@ -11402,6 +11425,53 @@ static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS, } } +SDValue DAGCombiner::foldShiftToAvg(SDNode *N) { + const unsigned Opcode = N->getOpcode(); + + // Convert (sr[al] (add n[su]w x, y)) -> (avgfloor[su] x, y) + if (Opcode != ISD::SRA && Opcode != ISD::SRL) + return SDValue(); + + unsigned FloorISD = 0; + auto VT = N->getValueType(0); + bool IsUnsigned = false; + + // Decide wether signed or unsigned. + switch (Opcode) { + case ISD::SRA: + if (!hasOperation(ISD::AVGFLOORS, VT)) + return SDValue(); + FloorISD = ISD::AVGFLOORS; + break; + case ISD::SRL: + IsUnsigned = true; + if (!hasOperation(ISD::AVGFLOORU, VT)) + return SDValue(); + FloorISD = ISD::AVGFLOORU; + break; + default: + return SDValue(); + } + + // Captured values. 
+ SDValue A, B, Add; + + // Match floor average as it is common to both floor/ceil avgs. + if (!sd_match(N, m_BinOp(Opcode, + m_AllOf(m_Value(Add), m_Add(m_Value(A), m_Value(B))), + m_One()))) + return SDValue(); + + // Can't optimize adds that may wrap. + if (IsUnsigned && !Add->getFlags().hasNoUnsignedWrap()) + return SDValue(); + + if (!IsUnsigned && !Add->getFlags().hasNoSignedWrap()) + return SDValue(); + + return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B}); +} + /// Generate Min/Max node SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, @@ -13922,11 +13992,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext x) -> (zext x) if the sign bit is known zero. if (!TLI.isSExtCheaperThanZExt(N0.getValueType(), VT) && (!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && - DAG.SignBitIsZero(N0)) { - SDNodeFlags Flags; - Flags.setNonNeg(true); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0, Flags); - } + DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0, SDNodeFlags::NonNeg); if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; @@ -14807,10 +14874,9 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { uint64_t PtrOff = PtrAdjustmentInBits / 8; SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. 
- SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - SDValue NewPtr = DAG.getMemBasePlusOffset( - LN0->getBasePtr(), TypeSize::getFixed(PtrOff), DL, Flags); + SDValue NewPtr = + DAG.getMemBasePlusOffset(LN0->getBasePtr(), TypeSize::getFixed(PtrOff), + DL, SDNodeFlags::NoUnsignedWrap); AddToWorklist(NewPtr.getNode()); SDValue Load; diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 281d1578d0173..9c7085cc7e7a8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -1105,6 +1105,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, if (Flags.hasDisjoint()) MI->setFlag(MachineInstr::MIFlag::Disjoint); + + if (Flags.hasSameSign()) + MI->setFlag(MachineInstr::MIFlag::SameSign); } // Emit all of the actual operands of this instruction, adding them to the diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 6ba12cfb8c514..142774ef4f2e4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1697,12 +1697,9 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const { SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit); } - SDNodeFlags Flags; - Flags.setDisjoint(true); - // Store the part with the modified sign and convert back to float. 
- SDValue CopiedSign = - DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit, Flags); + SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit, + SDNodeFlags::Disjoint); return modifySignAsInt(MagAsInt, DL, CopiedSign); } @@ -1773,7 +1770,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, Tmp1 = DAG.getNode(Opc, dl, VT, SP, Size); // Value if (Alignment > StackAlign) Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-Alignment.value(), dl, VT)); + DAG.getSignedConstant(-Alignment.value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); @@ -2348,75 +2345,6 @@ static bool useSinCos(SDNode *Node) { return false; } -/// Issue libcalls to sincos to compute sin / cos pairs. -void SelectionDAGLegalize::ExpandSinCosLibCall( - SDNode *Node, SmallVectorImpl &Results) { - EVT VT = Node->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - RTLIB::Libcall LC = RTLIB::getFSINCOS(VT); - - // Find users of the node that store the results (and share input chains). The - // destination pointers can be used instead of creating stack allocations. - SDValue StoresInChain{}; - std::array ResultStores = {nullptr}; - for (SDNode *User : Node->uses()) { - if (!ISD::isNormalStore(User)) - continue; - auto *ST = cast(User); - if (!ST->isSimple() || ST->getAddressSpace() != 0 || - ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) || - (StoresInChain && ST->getChain() != StoresInChain) || - Node->isPredecessorOf(ST->getChain().getNode())) - continue; - ResultStores[ST->getValue().getResNo()] = ST; - StoresInChain = ST->getChain(); - } - - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry{}; - - // Pass the argument. - Entry.Node = Node->getOperand(0); - Entry.Ty = Ty; - Args.push_back(Entry); - - // Pass the output pointers for sin and cos. 
- SmallVector ResultPtrs{}; - for (StoreSDNode *ST : ResultStores) { - SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT); - Entry.Node = ResultPtr; - Entry.Ty = PointerType::getUnqual(Ty->getContext()); - Args.push_back(Entry); - ResultPtrs.push_back(ResultPtr); - } - - SDLoc DL(Node); - SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode(); - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( - TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args)); - - auto [Call, OutChain] = TLI.LowerCallTo(CLI); - - for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { - MachinePointerInfo PtrInfo; - if (StoreSDNode *ST = ResultStores[ResNo]) { - // Replace store with the library call. - DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); - PtrInfo = ST->getPointerInfo(); - } else { - PtrInfo = MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - cast(ResultPtr)->getIndex()); - } - SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo); - Results.push_back(LoadResult); - } -} - SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4636,7 +4564,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; case ISD::FSINCOS: // Expand into sincos libcall. 
- ExpandSinCosLibCall(Node, Results); + (void)DAG.expandFSINCOS(Node, Results); break; case ISD::FLOG: case ISD::STRICT_FLOG: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index ee9c95c859376..45487c887b74d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -4674,9 +4674,9 @@ void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo, DAG.getNode(ISD::SHL, dl, ShAmtVT, SrlTmp, DAG.getConstant(Log2_32(ShiftUnitInBits), dl, ShAmtVT)); - Flags.setExact(true); - SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, BitOffset, - DAG.getConstant(3, dl, ShAmtVT), Flags); + SDValue ByteOffset = + DAG.getNode(ISD::SRL, dl, ShAmtVT, BitOffset, + DAG.getConstant(3, dl, ShAmtVT), SDNodeFlags::Exact); // And clamp it, because OOB load is an immediate UB, // while shift overflow would have *just* been poison. ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index c80da28b3dc34..8403c98545187 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1191,6 +1191,11 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { RTLIB::REM_PPCF128, Results)) return; + break; + case ISD::FSINCOS: + if (DAG.expandFSINCOS(Node, Results)) + return; + break; case ISD::VECTOR_COMPRESS: Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); @@ -1700,11 +1705,8 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { SDValue ClearedSign = DAG.getNode(ISD::VP_AND, DL, IntVT, Mag, ClearSignMask, Mask, EVL); - SDNodeFlags Flags; - Flags.setDisjoint(true); - SDValue CopiedSign = DAG.getNode(ISD::VP_OR, DL, IntVT, ClearedSign, SignBit, - Mask, EVL, Flags); + Mask, EVL, SDNodeFlags::Disjoint); return 
DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } @@ -1886,11 +1888,8 @@ SDValue VectorLegalizer::ExpandFCOPYSIGN(SDNode *Node) { APInt::getSignedMaxValue(IntVT.getScalarSizeInBits()), DL, IntVT); SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, Mag, ClearSignMask); - SDNodeFlags Flags; - Flags.setDisjoint(true); - - SDValue CopiedSign = - DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit, Flags); + SDValue CopiedSign = DAG.getNode(ISD::OR, DL, IntVT, ClearedSign, SignBit, + SDNodeFlags::Disjoint); return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5409ae7d9671c..eccda73548e87 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1381,16 +1381,14 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT, unsigned IncrementSize = MemVT.getSizeInBits().getKnownMinValue() / 8; if (MemVT.isScalableVector()) { - SDNodeFlags Flags; SDValue BytesIncrement = DAG.getVScale( DL, Ptr.getValueType(), APInt(Ptr.getValueSizeInBits().getFixedValue(), IncrementSize)); MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace()); - Flags.setNoUnsignedWrap(true); if (ScaledOffset) *ScaledOffset += IncrementSize; Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement, - Flags); + SDNodeFlags::NoUnsignedWrap); } else { MPI = N->getPointerInfo().getWithOffset(IncrementSize); // Increment the pointer to the other half. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5403d787861d4..d5cdd7163d791 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -2483,6 +2484,103 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, return Subvectors[0]; } +bool SelectionDAG::expandFSINCOS(SDNode *Node, + SmallVectorImpl &Results) { + EVT VT = Node->getValueType(0); + LLVMContext *Ctx = getContext(); + Type *Ty = VT.getTypeForEVT(*Ctx); + RTLIB::Libcall LC = + RTLIB::getFSINCOS(VT.isVector() ? VT.getVectorElementType() : VT); + + const char *LCName = TLI->getLibcallName(LC); + if (!LC || !LCName) + return false; + + auto getVecDesc = [&]() -> VecDesc const * { + for (bool Masked : {false, true}) { + if (VecDesc const *VD = getLibInfo().getVectorMappingInfo( + LCName, VT.getVectorElementCount(), Masked)) { + return VD; + } + } + return nullptr; + }; + + VecDesc const *VD = nullptr; + if (VT.isVector() && !(VD = getVecDesc())) + return false; + + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. 
+ SDValue StoresInChain{}; + std::array ResultStores = {nullptr}; + for (SDNode *User : Node->uses()) { + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast(User); + if (!ST->isSimple() || ST->getAddressSpace() != 0 || + ST->getAlign() < getDataLayout().getABITypeAlign(Ty->getScalarType()) || + (StoresInChain && ST->getChain() != StoresInChain) || + Node->isPredecessorOf(ST->getChain().getNode())) + continue; + ResultStores[ST->getValue().getResNo()] = ST; + StoresInChain = ST->getChain(); + } + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry{}; + + // Pass the argument. + Entry.Node = Node->getOperand(0); + Entry.Ty = Ty; + Args.push_back(Entry); + + // Pass the output pointers for sin and cos. + SmallVector ResultPtrs{}; + for (StoreSDNode *ST : ResultStores) { + SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(VT); + Entry.Node = ResultPtr; + Entry.Ty = PointerType::getUnqual(Ty->getContext()); + Args.push_back(Entry); + ResultPtrs.push_back(ResultPtr); + } + + SDLoc DL(Node); + + if (VD && VD->isMasked()) { + EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *Ctx, VT); + Entry.Node = getBoolConstant(true, DL, MaskVT, VT); + Entry.Ty = MaskVT.getTypeForEVT(*Ctx); + Args.push_back(Entry); + } + + SDValue InChain = StoresInChain ? StoresInChain : getEntryNode(); + SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName, + TLI->getPointerTy(getDataLayout())); + TargetLowering::CallLoweringInfo CLI(*this); + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( + TLI->getLibcallCallingConv(LC), Type::getVoidTy(*Ctx), Callee, + std::move(Args)); + + auto [Call, OutChain] = TLI->LowerCallTo(CLI); + + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + MachinePointerInfo PtrInfo; + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. 
+ ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + getMachineFunction(), cast(ResultPtr)->getIndex()); + } + SDValue LoadResult = getLoad(VT, DL, OutChain, ResultPtr, PtrInfo); + Results.push_back(LoadResult); + } + + return true; +} + SDValue SelectionDAG::expandVAArg(SDNode *Node) { SDLoc dl(Node); const TargetLowering &TLI = getTargetLoweringInfo(); @@ -12377,7 +12475,7 @@ bool SDNode::hasPredecessor(const SDNode *N) const { } void SDNode::intersectFlagsWith(const SDNodeFlags Flags) { - this->Flags.intersectWith(Flags); + this->Flags &= Flags; } SDValue diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 203e80e36b46d..f41dbe81434c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3652,6 +3652,10 @@ void SelectionDAGBuilder::visitICmp(const ICmpInst &I) { Op2 = DAG.getPtrExtOrTrunc(Op2, getCurSDLoc(), MemVT); } + SDNodeFlags Flags; + Flags.setSameSign(I.hasSameSign()); + SelectionDAG::FlagInserter FlagsInserter(DAG, Flags); + EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode)); @@ -4318,7 +4322,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { SDNodeFlags Flags; if (NW.hasNoUnsignedWrap() || (int64_t(Offset) >= 0 && NW.hasNoUnsignedSignedWrap())) - Flags.setNoUnsignedWrap(true); + Flags |= SDNodeFlags::NoUnsignedWrap; N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, DAG.getConstant(Offset, dl, N.getValueType()), Flags); @@ -4484,10 +4488,9 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // Round the size of the allocation up to the stack alignment size // by add SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. 
- SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getConstant(StackAlignMask, dl, IntPtr), Flags); + DAG.getConstant(StackAlignMask, dl, IntPtr), + SDNodeFlags::NoUnsignedWrap); // Mask out the low bits for alignment purposes. AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, @@ -11224,15 +11227,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // An aggregate return value cannot wrap around the address space, so // offsets to its parts don't wrap either. - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - MachineFunction &MF = CLI.DAG.getMachineFunction(); Align HiddenSRetAlign = MF.getFrameInfo().getObjectAlign(DemoteStackIdx); for (unsigned i = 0; i < NumValues; ++i) { - SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, - CLI.DAG.getConstant(Offsets[i], CLI.DL, - PtrVT), Flags); + SDValue Add = + CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot, + CLI.DAG.getConstant(Offsets[i], CLI.DL, PtrVT), + SDNodeFlags::NoUnsignedWrap); SDValue L = CLI.DAG.getLoad( RetTys[i], CLI.DL, CLI.Chain, Add, MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 703efb7008974..580ff19065557 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -653,6 +653,9 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (getFlags().hasDisjoint()) OS << " disjoint"; + if (getFlags().hasSameSign()) + OS << " samesign"; + if (getFlags().hasNonNeg()) OS << " nneg"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 981ab18b59c1c..0d99ae9cdebd5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -4224,11 +4224,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, // Set the NoFPExcept flag when no original matched node could // raise an FP exception, but the new node potentially might. - if (!MayRaiseFPException && mayRaiseFPException(Res)) { - SDNodeFlags Flags = Res->getFlags(); - Flags.setNoFPExcept(true); - Res->setFlags(Flags); - } + if (!MayRaiseFPException && mayRaiseFPException(Res)) + Res->setFlags(Res->getFlags() | SDNodeFlags::NoFPExcept); // If the node had chain/glue results, update our notion of the current // chain and glue. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 758b3a5fc526e..1f4ace1b3174d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -808,6 +808,24 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( } break; } + case ISD::SRL: { + // If we are only demanding sign bits then we can use the shift source + // directly. + if (std::optional MaxSA = + DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { + SDValue Op0 = Op.getOperand(0); + unsigned ShAmt = *MaxSA; + // Must already be signbits in DemandedBits bounds, and can't demand any + // shifted in zeroes. 
+ if (DemandedBits.countl_zero() >= ShAmt) { + unsigned NumSignBits = + DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); + if (DemandedBits.countr_zero() >= (BitWidth - NumSignBits)) + return Op0; + } + } + break; + } case ISD::SETCC: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -1486,22 +1504,15 @@ bool TargetLowering::SimplifyDemandedBits( case ISD::OR: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - SDNodeFlags Flags = Op.getNode()->getFlags(); if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO, Depth + 1)) { - if (Flags.hasDisjoint()) { - Flags.setDisjoint(false); - Op->setFlags(Flags); - } + Op->dropFlags(SDNodeFlags::Disjoint); return true; } if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts, Known2, TLO, Depth + 1)) { - if (Flags.hasDisjoint()) { - Flags.setDisjoint(false); - Op->setFlags(Flags); - } + Op->dropFlags(SDNodeFlags::Disjoint); return true; } @@ -1806,14 +1817,9 @@ bool TargetLowering::SimplifyDemandedBits( APInt InDemandedMask = DemandedBits.lshr(ShAmt); if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) { - SDNodeFlags Flags = Op.getNode()->getFlags(); - if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { - // Disable the nsw and nuw flags. We can no longer guarantee that we - // won't wrap after simplification. - Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); - Op->setFlags(Flags); - } + // Disable the nsw and nuw flags. We can no longer guarantee that we + // won't wrap after simplification. 
+ Op->dropFlags(SDNodeFlags::NoWrap); return true; } Known.Zero <<= ShAmt; @@ -1897,14 +1903,9 @@ bool TargetLowering::SimplifyDemandedBits( APInt DemandedFromOp(APInt::getLowBitsSet(BitWidth, BitWidth - CTLZ)); if (SimplifyDemandedBits(Op0, DemandedFromOp, DemandedElts, Known, TLO, Depth + 1)) { - SDNodeFlags Flags = Op.getNode()->getFlags(); - if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { - // Disable the nsw and nuw flags. We can no longer guarantee that we - // won't wrap after simplification. - Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); - Op->setFlags(Flags); - } + // Disable the nsw and nuw flags. We can no longer guarantee that we + // won't wrap after simplification. + Op->dropFlags(SDNodeFlags::NoWrap); return true; } Known.resetAll(); @@ -2456,15 +2457,11 @@ bool TargetLowering::SimplifyDemandedBits( return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src)); } - SDNodeFlags Flags = Op->getFlags(); APInt InDemandedBits = DemandedBits.trunc(InBits); APInt InDemandedElts = DemandedElts.zext(InElts); if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO, Depth + 1)) { - if (Flags.hasNonNeg()) { - Flags.setNonNeg(false); - Op->setFlags(Flags); - } + Op->dropFlags(SDNodeFlags::NonNeg); return true; } assert(Known.getBitWidth() == InBits && "Src width has changed?"); @@ -2528,7 +2525,7 @@ bool TargetLowering::SimplifyDemandedBits( if (!TLO.LegalOperations() || isOperationLegal(Opc, VT)) { SDNodeFlags Flags; if (!IsVecInReg) - Flags.setNonNeg(true); + Flags |= SDNodeFlags::NonNeg; return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src, Flags)); } } @@ -2836,13 +2833,9 @@ bool TargetLowering::SimplifyDemandedBits( DemandedElts, KnownOp0, TLO, Depth + 1) || // See if the operation should be performed at a smaller bit width. ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) { - if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { - // Disable the nsw and nuw flags. 
We can no longer guarantee that we - // won't wrap after simplification. - Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); - Op->setFlags(Flags); - } + // Disable the nsw and nuw flags. We can no longer guarantee that we + // won't wrap after simplification. + Op->dropFlags(SDNodeFlags::NoWrap); return true; } @@ -2858,12 +2851,10 @@ bool TargetLowering::SimplifyDemandedBits( SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( Op1, LoMask, DemandedElts, TLO.DAG, Depth + 1); if (DemandedOp0 || DemandedOp1) { - Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); Op0 = DemandedOp0 ? DemandedOp0 : Op0; Op1 = DemandedOp1 ? DemandedOp1 : Op1; - SDValue NewOp = - TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags); + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, + Flags & ~SDNodeFlags::NoWrap); return TLO.CombineTo(Op, NewOp); } } @@ -2880,9 +2871,8 @@ bool TargetLowering::SimplifyDemandedBits( SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT); // Disable the nsw and nuw flags. We can no longer guarantee that we // won't wrap after simplification. 
- Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); - SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags); + SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, + Flags & ~SDNodeFlags::NoWrap); return TLO.CombineTo(Op, NewOp); } @@ -6157,9 +6147,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, SDValue Res = Op0; if (UseSRA) { - SDNodeFlags Flags; - Flags.setExact(true); - Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags); + Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, SDNodeFlags::Exact); Created.push_back(Res.getNode()); } @@ -6220,9 +6208,7 @@ static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N, SDValue Res = N->getOperand(0); if (UseSRL) { - SDNodeFlags Flags; - Flags.setExact(true); - Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags); + Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, SDNodeFlags::Exact); Created.push_back(Res.getNode()); } @@ -8447,9 +8433,7 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred); // Copy FMF flags, but always set the no-signed-zeros flag // as this is implied by the FMINNUM/FMAXNUM semantics. - SDNodeFlags Flags = Node->getFlags(); - Flags.setNoSignedZeros(true); - SelCC->setFlags(Flags); + SelCC->setFlags(Node->getFlags() | SDNodeFlags::NoSignedZeros); return SelCC; } @@ -11805,10 +11789,8 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, // Re-write the last ValI if all lanes were selected. Otherwise, // overwrite the last write it with the passthru value. 
- SDNodeFlags Flags{}; - Flags.setUnpredictable(true); LastWriteVal = DAG.getSelect(DL, ScalarVT, AllLanesSelected, ValI, - LastWriteVal, Flags); + LastWriteVal, SDNodeFlags::Unpredictable); Chain = DAG.getStore( Chain, DL, LastWriteVal, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 07dfbc41e79b0..c2780faee403d 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -902,11 +902,6 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1, Constant *ExtractIdx = ConstantInt::get(Ty, i); Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx); Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx); - - // If any element of a divisor vector is zero, the whole op is poison. - if (Instruction::isIntDivRem(Opcode) && RHS->isNullValue()) - return PoisonValue::get(VTy); - Constant *Res = ConstantExpr::isDesirableBinOp(Opcode) ? ConstantExpr::get(Opcode, LHS, RHS) : ConstantFoldBinaryInstruction(Opcode, LHS, RHS); diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 488554c84c1c4..3f28dd39911f7 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -298,7 +298,9 @@ static cl::opt UseLoopVersioningLICM( static cl::opt InstrumentColdFuncOnlyPath( "instrument-cold-function-only-path", cl::init(""), - cl::desc("File path for cold function only instrumentation"), cl::Hidden); + cl::desc("File path for cold function only instrumentation(requires use " + "with --pgo-instrument-cold-function-only)"), + cl::Hidden); extern cl::opt UseCtxProfile; extern cl::opt PGOInstrumentColdFunctionOnly; @@ -1135,7 +1137,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // post link pipeline after ICP. This is to enable usage of the type // tests in ICP sequences. 
if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, + lowertypetests::DropTestKind::Assume)); invokePipelineEarlySimplificationEPCallbacks(MPM, Level); @@ -1187,10 +1190,13 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, const bool IsCtxProfUse = !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; - // Enable cold function coverage instrumentation if - // InstrumentColdFuncOnlyPath is provided. - const bool IsColdFuncOnlyInstrGen = PGOInstrumentColdFunctionOnly = - IsPGOPreLink && !InstrumentColdFuncOnlyPath.empty(); + assert( + (InstrumentColdFuncOnlyPath.empty() || PGOInstrumentColdFunctionOnly) && + "--instrument-cold-function-only-path is provided but " + "--pgo-instrument-cold-function-only is not enabled"); + const bool IsColdFuncOnlyInstrGen = PGOInstrumentColdFunctionOnly && + IsPGOPreLink && + !InstrumentColdFuncOnlyPath.empty(); if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen || IsCtxProfUse || IsColdFuncOnlyInstrGen) @@ -1644,6 +1650,13 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO, MPM.addPass(buildLTOPreLinkDefaultPipeline(Level)); MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary)); + // If we're doing FatLTO w/ CFI enabled, we don't want the type tests in the + // object code, only in the bitcode section, so drop it before we run + // module optimization and generate machine code. If llvm.type.test() isn't in + // the IR, this won't do anything. 
+ MPM.addPass( + LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::All)); + // Use the ThinLTO post-link pipeline with sample profiling if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr)); @@ -1750,7 +1763,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( if (Level == OptimizationLevel::O0) { // Run a second time to clean up any type tests left behind by WPD for use // in ICP. - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, + lowertypetests::DropTestKind::Assume)); // Drop available_externally and unreferenced globals. This is necessary // with ThinLTO in order to avoid leaving undefined references to dead // globals in the object file. @@ -1801,7 +1815,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); // Run a second time to clean up any type tests left behind by WPD for use // in ICP. - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, + lowertypetests::DropTestKind::Assume)); invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); @@ -1879,7 +1894,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // Run a second time to clean up any type tests left behind by WPD for use // in ICP (which is performed earlier than this in the regular LTO // pipeline). 
- MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, + lowertypetests::DropTestKind::Assume)); invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); @@ -2060,7 +2076,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); // Run a second time to clean up any type tests left behind by WPD for use // in ICP (which is performed earlier than this in the regular LTO pipeline). - MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); + MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, + lowertypetests::DropTestKind::Assume)); // Enable splitting late in the FullLTO post-link pipeline. if (EnableHotColdSplit) diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index f09241681b92a..0ab9f942a0858 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/OnDiskHashTable.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index c1b0fdbc077bb..634f27f57b00a 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -93,7 +93,7 @@ void llvm::initTimerOptions() { *SortTimers; } -std::unique_ptr llvm::CreateInfoOutputFile() { +std::unique_ptr llvm::CreateInfoOutputFile() { const std::string &OutputFilename = getLibSupportInfoOutputFilename(); if (OutputFilename.empty()) return std::make_unique(2, false); // stderr. 
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 6d2dd0ecbccf3..e79457f925db6 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -169,6 +169,11 @@ class AArch64AsmPrinter : public AsmPrinter { // adrp-add followed by PAC sign) void LowerMOVaddrPAC(const MachineInstr &MI); + // Emit the sequence for LOADgotAUTH (load signed pointer from signed ELF GOT + // and authenticate it with, if FPAC bit is not set, check+trap sequence after + // authenticating) + void LowerLOADgotAUTH(const MachineInstr &MI); + /// tblgen'erated driver function for lowering simple MI->MC /// pseudo instructions. bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst); @@ -873,6 +878,22 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { OutStreamer->addBlankLine(); } + + // With signed ELF GOT enabled, the linker looks at the symbol type to + // choose between keys IA (for STT_FUNC) and DA (for other types). Symbols + // for functions not defined in the module have STT_NOTYPE type by default. + // This makes linker to emit signing schema with DA key (instead of IA) for + // corresponding R_AARCH64_AUTH_GLOB_DAT dynamic reloc. To avoid that, force + // all function symbols used in the module to have STT_FUNC type. See + // https://github.com/ARM-software/abi-aa/blob/main/pauthabielf64/pauthabielf64.rst#default-signing-schema + const auto *PtrAuthELFGOTFlag = mdconst::extract_or_null( + M.getModuleFlag("ptrauth-elf-got")); + if (PtrAuthELFGOTFlag && PtrAuthELFGOTFlag->getZExtValue() == 1) + for (const GlobalValue &GV : M.global_values()) + if (!GV.use_empty() && isa(GV) && + !GV.getName().starts_with("llvm.")) + OutStreamer->emitSymbolAttribute(getSymbol(&GV), + MCSA_ELF_TypeFunction); } // Emit stack and fault map information. 
@@ -2068,6 +2089,10 @@ void AArch64AsmPrinter::LowerLOADauthptrstatic(const MachineInstr &MI) { void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { const bool IsGOTLoad = MI.getOpcode() == AArch64::LOADgotPAC; + const bool IsELFSignedGOT = MI.getParent() + ->getParent() + ->getInfo() + ->hasELFSignedGOT(); MachineOperand GAOp = MI.getOperand(0); const uint64_t KeyC = MI.getOperand(1).getImm(); assert(KeyC <= AArch64PACKey::LAST && @@ -2084,9 +2109,17 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { // Emit: // target materialization: // - via GOT: - // adrp x16, :got:target - // ldr x16, [x16, :got_lo12:target] - // add offset to x16 if offset != 0 + // - unsigned GOT: + // adrp x16, :got:target + // ldr x16, [x16, :got_lo12:target] + // add offset to x16 if offset != 0 + // - ELF signed GOT: + // adrp x17, :got:target + // add x17, x17, :got_auth_lo12:target + // ldr x16, [x17] + // aut{i|d}a x16, x17 + // check+trap sequence (if no FPAC) + // add offset to x16 if offset != 0 // // - direct: // adrp x16, target @@ -2129,13 +2162,48 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { MCInstLowering.lowerOperand(GAMOLo, GAMCLo); EmitToStreamer( - MCInstBuilder(AArch64::ADRP).addReg(AArch64::X16).addOperand(GAMCHi)); + MCInstBuilder(AArch64::ADRP) + .addReg(IsGOTLoad && IsELFSignedGOT ? AArch64::X17 : AArch64::X16) + .addOperand(GAMCHi)); if (IsGOTLoad) { - EmitToStreamer(MCInstBuilder(AArch64::LDRXui) - .addReg(AArch64::X16) - .addReg(AArch64::X16) - .addOperand(GAMCLo)); + if (IsELFSignedGOT) { + EmitToStreamer(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X17) + .addReg(AArch64::X17) + .addOperand(GAMCLo) + .addImm(0)); + + EmitToStreamer(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X16) + .addReg(AArch64::X17) + .addImm(0)); + + assert(GAOp.isGlobal()); + assert(GAOp.getGlobal()->getValueType() != nullptr); + unsigned AuthOpcode = GAOp.getGlobal()->getValueType()->isFunctionTy() + ? 
AArch64::AUTIA + : AArch64::AUTDA; + + EmitToStreamer(MCInstBuilder(AuthOpcode) + .addReg(AArch64::X16) + .addReg(AArch64::X16) + .addReg(AArch64::X17)); + + if (!STI->hasFPAC()) { + auto AuthKey = (AuthOpcode == AArch64::AUTIA ? AArch64PACKey::IA + : AArch64PACKey::DA); + + emitPtrauthCheckAuthenticatedValue(AArch64::X16, AArch64::X17, AuthKey, + /*ShouldTrap=*/true, + /*OnFailure=*/nullptr); + } + } else { + EmitToStreamer(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X16) + .addReg(AArch64::X16) + .addOperand(GAMCLo)); + } } else { EmitToStreamer(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X16) @@ -2203,6 +2271,69 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { EmitToStreamer(MIB); } +void AArch64AsmPrinter::LowerLOADgotAUTH(const MachineInstr &MI) { + Register DstReg = MI.getOperand(0).getReg(); + Register AuthResultReg = STI->hasFPAC() ? DstReg : AArch64::X16; + const MachineOperand &GAMO = MI.getOperand(1); + assert(GAMO.getOffset() == 0); + + MachineOperand GAHiOp(GAMO); + MachineOperand GALoOp(GAMO); + GAHiOp.addTargetFlag(AArch64II::MO_PAGE); + GALoOp.addTargetFlag(AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + MCOperand GAMCHi, GAMCLo; + MCInstLowering.lowerOperand(GAHiOp, GAMCHi); + MCInstLowering.lowerOperand(GALoOp, GAMCLo); + + EmitToStreamer( + MCInstBuilder(AArch64::ADRP).addReg(AArch64::X17).addOperand(GAMCHi)); + + EmitToStreamer(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X17) + .addReg(AArch64::X17) + .addOperand(GAMCLo) + .addImm(0)); + + EmitToStreamer(MCInstBuilder(AArch64::LDRXui) + .addReg(AuthResultReg) + .addReg(AArch64::X17) + .addImm(0)); + + assert(GAMO.isGlobal()); + MCSymbol *UndefWeakSym; + if (GAMO.getGlobal()->hasExternalWeakLinkage()) { + UndefWeakSym = createTempSymbol("undef_weak"); + EmitToStreamer( + MCInstBuilder(AArch64::CBZX) + .addReg(AuthResultReg) + .addExpr(MCSymbolRefExpr::create(UndefWeakSym, OutContext))); + } + + assert(GAMO.getGlobal()->getValueType() != nullptr); + unsigned 
AuthOpcode = GAMO.getGlobal()->getValueType()->isFunctionTy() + ? AArch64::AUTIA + : AArch64::AUTDA; + EmitToStreamer(MCInstBuilder(AuthOpcode) + .addReg(AuthResultReg) + .addReg(AuthResultReg) + .addReg(AArch64::X17)); + + if (GAMO.getGlobal()->hasExternalWeakLinkage()) + OutStreamer->emitLabel(UndefWeakSym); + + if (!STI->hasFPAC()) { + auto AuthKey = + (AuthOpcode == AArch64::AUTIA ? AArch64PACKey::IA : AArch64PACKey::DA); + + emitPtrauthCheckAuthenticatedValue(AuthResultReg, AArch64::X17, AuthKey, + /*ShouldTrap=*/true, + /*OnFailure=*/nullptr); + + emitMovXReg(DstReg, AuthResultReg); + } +} + const MCExpr * AArch64AsmPrinter::lowerBlockAddressConstant(const BlockAddress &BA) { const MCExpr *BAE = AsmPrinter::lowerBlockAddressConstant(BA); @@ -2381,6 +2512,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { LowerMOVaddrPAC(*MI); return; + case AArch64::LOADgotAUTH: + LowerLOADgotAUTH(*MI); + return; + case AArch64::BRA: case AArch64::BLRA: emitPtrauthBranch(MI); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 6c874fcabcc30..d1e5d10829d55 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -454,6 +454,9 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; + if (FuncInfo.MF->getInfo()->hasELFSignedGOT()) + return 0; + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 31a720ed7b5c7..0814380b18848 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7927,10 +7927,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, 
Ptr.getValueType()); } - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - BytesIncrement, Flags); + BytesIncrement, SDNodeFlags::NoUnsignedWrap); ExtraArgLocs++; i++; } @@ -8986,12 +8984,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, Ptr.getValueType()); } - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - MPI = MachinePointerInfo(MPI.getAddrSpace()); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - BytesIncrement, Flags); + BytesIncrement, SDNodeFlags::NoUnsignedWrap); ExtraArgLocs++; i++; } @@ -9604,6 +9599,11 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. + if (DAG.getMachineFunction() + .getInfo() + ->hasELFSignedGOT()) + return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr), + 0); return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); } @@ -11777,8 +11777,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags; - Flags.setAllowReassociation(true); + SDNodeFlags Flags = SDNodeFlags::AllowReassociation; // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) @@ -11807,8 +11806,7 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, SDLoc DL(Operand); EVT VT = Operand.getValueType(); - SDNodeFlags Flags; - Flags.setAllowReassociation(true); + SDNodeFlags Flags = SDNodeFlags::AllowReassociation; // Newton reciprocal iteration: E * (2 - X * E) // AArch64 reciprocal iteration instruction: (2 - M * N) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 457e918728ae2..424848252f6aa 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1942,8 +1942,15 @@ let Predicates = [HasPAuth] in { Sched<[WriteI, ReadI]> { let isReMaterializable = 1; let isCodeGenOnly = 1; - let Size = 40; // 12 fixed + 28 variable, for pointer offset, and discriminator - let Defs = [X16,X17]; + let Size = 68; // 12 fixed + 56 variable, for pointer offset, discriminator and + // ELF signed GOT signed pointer authentication (if no FPAC) + let Defs = [X16,X17,NZCV]; + } + + def LOADgotAUTH : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr), []>, + Sched<[WriteI, ReadI]> { + let Defs = [X16,X17,NZCV]; + let Size = 44; } // Load a signed global address from a special $auth_ptr$ stub slot. @@ -7227,8 +7234,23 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) )>; -multiclass Neon_INS_elt_pattern { +// Move elements between vectors +multiclass Neon_INS_elt_pattern { + // Extracting from the lowest 128-bits of an SVE vector + def : Pat<(VT128 (vector_insert VT128:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (INS VT128:$Rn, imm:$Immd, (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn)>; + + def : Pat<(VT64 (vector_insert VT64:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), VT64:$Rn, dsub), imm:$Immd, + (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn), + dsub)>; + // Extracting from another NEON vector def : Pat<(VT128 (vector_insert V128:$src, (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), @@ -7256,15 +7278,15 @@ multiclass Neon_INS_elt_pattern; } -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : 
Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; // Insert from bitcast // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0) diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 48672241f905d..9f234b0f91705 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64MCInstLower.h" +#include "AArch64MachineFunctionInfo.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -185,9 +186,12 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const { uint32_t RefFlags = 0; - if (MO.getTargetFlags() & AArch64II::MO_GOT) - RefFlags |= AArch64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + if (MO.getTargetFlags() & AArch64II::MO_GOT) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + RefFlags |= (MF->getInfo()->hasELFSignedGOT() + ? 
AArch64MCExpr::VK_GOT_AUTH + : AArch64MCExpr::VK_GOT); + } else if (MO.getTargetFlags() & AArch64II::MO_TLS) { TLSModel::Model Model; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index e96c5a953ff2b..f08506979a1ae 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -72,6 +72,18 @@ static bool ShouldSignWithBKey(const Function &F, const AArch64Subtarget &STI) { return Key == "b_key"; } +static bool hasELFSignedGOTHelper(const Function &F, + const AArch64Subtarget *STI) { + if (!Triple(STI->getTargetTriple()).isOSBinFormatELF()) + return false; + const Module *M = F.getParent(); + const auto *Flag = mdconst::extract_or_null( + M->getModuleFlag("ptrauth-elf-got")); + if (Flag && Flag->getZExtValue() == 1) + return true; + return false; +} + AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI) { // If we already know that the function doesn't have a redzone, set @@ -80,6 +92,7 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, HasRedZone = false; std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F); SignWithBKey = ShouldSignWithBKey(F, *STI); + HasELFSignedGOT = hasELFSignedGOTHelper(F, STI); // TODO: skip functions that have no instrumented allocas for optimization IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 85b9733e95c52..a77fdaf19bcf5 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -177,6 +177,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. 
bool SignWithBKey = false; + /// HasELFSignedGOT is true if the target binary format is ELF and the IR + /// module containing the corresponding function has "ptrauth-elf-got" flag + /// set to 1. + bool HasELFSignedGOT = false; + /// SigningInstrOffset captures the offset of the PAC-RET signing instruction /// within the prologue, so it can be re-used for authentication in the /// epilogue when using PC as a second salt (FEAT_PAuth_LR) @@ -509,6 +514,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool shouldSignWithBKey() const { return SignWithBKey; } + bool hasELFSignedGOT() const { return HasELFSignedGOT; } + MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; } void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 8516ab2c7dd71..4117d74d10c1e 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -19,18 +19,24 @@ class AArch64Reg enc, string n, list subregs = [], } let Namespace = "AArch64" in { + // SubRegIndexes for GPR registers def sub_32 : SubRegIndex<32>; + def sube64 : SubRegIndex<64>; + def subo64 : SubRegIndex<64>; + def sube32 : SubRegIndex<32>; + def subo32 : SubRegIndex<32>; + // SubRegIndexes for FPR/Vector registers def bsub : SubRegIndex<8>; def hsub : SubRegIndex<16>; def ssub : SubRegIndex<32>; def dsub : SubRegIndex<64>; - def sube32 : SubRegIndex<32>; - def subo32 : SubRegIndex<32>; - def sube64 : SubRegIndex<64>; - def subo64 : SubRegIndex<64>; - // SVE - def zsub : SubRegIndex<128>; + def zsub : SubRegIndex<128>; + // Note: Code depends on these having consecutive numbers + def zsub0 : SubRegIndex<128, -1>; + def zsub1 : SubRegIndex<128, -1>; + def zsub2 : SubRegIndex<128, -1>; + def zsub3 : SubRegIndex<128, -1>; // Note: Code depends on these having consecutive numbers def dsub0 : SubRegIndex<64>; def dsub1 : 
SubRegIndex<64>; @@ -41,7 +47,8 @@ let Namespace = "AArch64" in { def qsub1 : SubRegIndex<128>; def qsub2 : SubRegIndex<128>; def qsub3 : SubRegIndex<128>; - // Note: Code depends on these having consecutive numbers + + // SubRegIndexes for SME Matrix tiles def zasubb : SubRegIndex<2048>; // (16 x 16)/1 bytes = 2048 bits def zasubh0 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits def zasubh1 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits @@ -52,7 +59,11 @@ let Namespace = "AArch64" in { def zasubq0 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits def zasubq1 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits - def psub : SubRegIndex<16>; + // SubRegIndexes for SVE Predicates + def psub : SubRegIndex<16>; + // Note: Code depends on these having consecutive numbers + def psub0 : SubRegIndex<16, -1>; + def psub1 : SubRegIndex<16, -1>; } let Namespace = "AArch64" in { @@ -1026,11 +1037,6 @@ def PNR16_p8to15 : PNRP8to15RegOp<"h", PNRAsmOp16_p8to15, 16, PNR_p8to15>; def PNR32_p8to15 : PNRP8to15RegOp<"s", PNRAsmOp32_p8to15, 32, PNR_p8to15>; def PNR64_p8to15 : PNRP8to15RegOp<"d", PNRAsmOp64_p8to15, 64, PNR_p8to15>; -let Namespace = "AArch64" in { - def psub0 : SubRegIndex<16, -1>; - def psub1 : SubRegIndex<16, -1>; -} - class PPRorPNRClass : RegisterClass< "AArch64", [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1, aarch64svcount ], 16, @@ -1123,8 +1129,7 @@ let EncoderMethod = "EncodeRegMul_MinMax<2, 0, 14>", } // end let EncoderMethod/DecoderMethod -//****************************************************************************** - +//===----------------------------------------------------------------------===// // SVE vector register classes class ZPRClass : RegisterClass<"AArch64", [nxv16i8, nxv8i16, nxv4i32, nxv2i64, @@ -1245,13 +1250,6 @@ def FPR32asZPR : FPRasZPROperand<32>; def FPR64asZPR : FPRasZPROperand<64>; def FPR128asZPR : FPRasZPROperand<128>; -let Namespace = "AArch64" in { - def zsub0 : SubRegIndex<128, -1>; - def zsub1 : SubRegIndex<128, -1>; 
- def zsub2 : SubRegIndex<128, -1>; - def zsub3 : SubRegIndex<128, -1>; -} - // Pairs, triples, and quads of SVE vector registers. def ZSeqPairs : RegisterTuples<[zsub0, zsub1], [(rotl ZPR, 0), (rotl ZPR, 1)]>; def ZSeqTriples : RegisterTuples<[zsub0, zsub1, zsub2], [(rotl ZPR, 0), (rotl ZPR, 1), (rotl ZPR, 2)]>; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index b716529428894..42f17747b9401 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -131,6 +131,42 @@ defm USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa", int_aarch64_sme defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme_usmops_wide>; } +let Predicates = [HasSME2p2] in { + defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a">; + defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s">; + defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a">; + defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s">; + defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a">; + defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s">; + defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a">; + defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s">; + + defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a">; + defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s">; + defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a">; + defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s">; +} + +let Predicates = [HasSME2p2, HasSMEI16I64] in { + defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a">; + defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s">; + defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a">; + defm SUMOP4S : 
sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s">; + defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a">; + defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s">; + defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a">; + defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s">; +} + +let Predicates = [HasSME2p2] in { +def STMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b00100, ZZ_b_mul_r, ZPR8, "stmopa">; +def STMOPA_M2ZZZI_HtoS : sme_int_sparse_outer_product_i32<0b00101, ZZ_h_mul_r, ZPR16, "stmopa">; +def UTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b11100, ZZ_b_mul_r, ZPR8, "utmopa">; +def UTMOPA_M2ZZZI_HtoS : sme_int_sparse_outer_product_i32<0b10101, ZZ_h_mul_r, ZPR16, "utmopa">; +def SUTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b01100, ZZ_b_mul_r, ZPR8, "sutmopa">; +def USTMOPA_M2ZZZI_BtoS : sme_int_sparse_outer_product_i32<0b10100, ZZ_b_mul_r, ZPR8, "ustmopa">; +} + let Predicates = [HasSME] in { //===----------------------------------------------------------------------===// // Loads and stores @@ -1017,6 +1053,12 @@ let Predicates = [HasSME2p2] in { defm FMUL_2Z2Z : sme2_multi2_fmul_mm< "fmul">; defm FMUL_4ZZ : sme2_multi4_fmul_sm<"fmul">; defm FMUL_4Z4Z : sme2_multi4_fmul_mm< "fmul">; + + defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a">; + defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s">; + + defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">; + defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">; } // [HasSME2p2] let Predicates = [HasSME2p2, HasSMEB16B16] in { @@ -1047,3 +1089,13 @@ let Predicates = [HasSME2p2, HasSMEF8F32] in { defm FMOP4A : sme2_fmop4a_fp8_fp32_4way<"fmop4a">; } } + +let Predicates = [HasSME2p2, HasSMEB16B16] in { + defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a">; + defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s">; +} + +let Predicates = [HasSME2p2, HasSMEF64F64] in { + defm FMOP4A : 
sme2_fmop4as_fp64_non_widening<0, "fmop4a">; + defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s">; +} diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d6662d15617fa..6fdcaec86340c 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3366,6 +3366,21 @@ let Predicates = [HasSVEorSME] in { (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; def : Pat<(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)), (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>; + + // Move element from the bottom 128-bits of a scalable vector to a single-element vector. + // Alternative case where insertelement is just scalar_to_vector rather than vector_insert. + def : Pat<(v1f64 (scalar_to_vector + (f64 (vector_extract nxv2f64:$vec, VectorIndexD:$index)))), + (EXTRACT_SUBREG + (INSvi64lane (IMPLICIT_DEF), (i64 0), + (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index), + dsub)>; + def : Pat<(v1i64 (scalar_to_vector + (i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)))), + (EXTRACT_SUBREG + (INSvi64lane (IMPLICIT_DEF), (i64 0), + (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index), + dsub)>; } // End HasNEON let Predicates = [HasNEON] in { @@ -4307,6 +4322,13 @@ let Predicates = [HasSVE2p2orSME2p2] in { // SVE predicate count defm FIRSTP_XPP : sve_int_pcount_pred_tmp<0b001, "firstp">; defm LASTP_XPP : sve_int_pcount_pred_tmp<0b010, "lastp">; + + // SVE reverse within elements, zeroing predicate + defm RBIT_ZPzZ : sve_int_perm_rev_rbit_z<"rbit">; + defm REVB_ZPzZ : sve_int_perm_rev_revb_z<"revb">; + defm REVH_ZPzZ : sve_int_perm_rev_revh_z<"revh">; + def REVW_ZPzZ : sve_int_perm_rev_z<0b11, 0b0110, "revw", ZPR64>; + def REVD_ZPzZ : sve_int_perm_rev_z<0b00, 0b1110, "revd", ZPR128>; } // End HasSME2p2orSVE2p2 //===----------------------------------------------------------------------===// diff 
--git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 7fb2a961e0313..736d57e6ae2fd 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -86,6 +86,13 @@ static cl::alias AArch64StreamingStackHazardSize( cl::desc("alias for -aarch64-streaming-hazard-size"), cl::aliasopt(AArch64StreamingHazardSize)); +// Subreg liveness tracking is disabled by default for now until all issues +// are ironed out. This option allows the feature to be used in tests. +static cl::opt + EnableSubregLivenessTracking("aarch64-enable-subreg-liveness-tracking", + cl::init(false), cl::Hidden, + cl::desc("Enable subreg liveness tracking")); + unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) return OverrideVectorInsertExtractBaseCost; @@ -380,6 +387,8 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, ReserveXRegisterForRA.set(29); AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM)); + + EnableSubregLiveness = EnableSubregLivenessTracking.getValue(); } const CallLowering *AArch64Subtarget::getCallLowering() const { diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 50adb7cbf69a8..f3dcce3f3994b 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -90,6 +90,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned VScaleForTuning = 2; TailFoldingOpts DefaultSVETFOpts = TailFoldingOpts::Disabled; + bool EnableSubregLiveness; + /// TargetTriple - What processor and OS we're targeting. 
Triple TargetTriple; @@ -153,6 +155,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } + bool enableSubRegLiveness() const override { return EnableSubregLiveness; } bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5a487be5723ce..712f6de52941c 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -897,6 +897,7 @@ class AArch64Operand : public MCParsedAsmOperand { if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || ELFRefKind == AArch64MCExpr::VK_LO12 || ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || @@ -1008,19 +1009,20 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Addend; if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF - || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF - || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) - || ELFRefKind == AArch64MCExpr::VK_LO12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 - || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC - || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 - || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 - || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC - || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 - || ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 - || ELFRefKind == 
AArch64MCExpr::VK_SECREL_LO12; + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF || + (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 || + ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_SECREL_LO12; } // If it's a constant, it should be a real immediate in range. @@ -3309,6 +3311,7 @@ ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_AUTH_PAGE && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 && ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { @@ -4428,6 +4431,8 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("got", AArch64MCExpr::VK_GOT_PAGE) .Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15) .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("got_auth", AArch64MCExpr::VK_GOT_AUTH_PAGE) + .Case("got_auth_lo12", AArch64MCExpr::VK_GOT_AUTH_LO12) .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) @@ -5801,6 +5806,7 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, // Only allow these with ADDXri/ADDWri if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_AUTH_LO12 || ELFRefKind == 
AArch64MCExpr::VK_DTPREL_HI12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index afea08ab09250..9502b1d10f9a2 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2967,7 +2967,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } if (OpFlags & AArch64II::MO_GOT) { - I.setDesc(TII.get(AArch64::LOADgot)); + I.setDesc(TII.get(MF.getInfo()->hasELFSignedGOT() + ? AArch64::LOADgotAUTH + : AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); } else if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 6024027afaf6c..f162d1c2973cb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -978,6 +978,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) .legalIf( typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0})) + .legalFor(HasSVE, {{nxv16s8, s32, s64}, + {nxv8s16, s32, s64}, + {nxv4s32, s32, s64}, + {nxv2s64, s64, s64}}) .moreElementsToNextPow2(0) .widenVectorEltsToVectorMinSize(0, 64) .clampNumElements(0, v8s8, v16s8) @@ -1316,6 +1320,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarOrEltToNextPow2(0) .immIdx(0); // Inform verifier imm idx 0 is handled. 
+ // TODO: {nxv16s8, s8}, {nxv8s16, s16} + getActionDefinitionsBuilder(G_SPLAT_VECTOR) + .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}}); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index b40fe55fdfaf6..2bcfdc1b46873 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -426,6 +426,9 @@ void applyNonConstInsert(MachineInstr &MI, MachineRegisterInfo &MRI, LLT EltTy = MRI.getType(Insert.getElementReg()); LLT IdxTy = MRI.getType(Insert.getIndexReg()); + if (VecTy.isScalableVector()) + return; + // Create a stack slot and store the vector into it MachineFunction &MF = Builder.getMF(); Align Alignment( diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 83aac6fdae729..b5f5a58d96288 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -165,6 +165,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) return R_CLS(ADR_GOT_PAGE); + if (SymLoc == AArch64MCExpr::VK_GOT_AUTH && !IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "ILP32 ADRP AUTH relocation not supported " + "(LP64 eqv: AUTH_ADR_GOT_PAGE)"); + return ELF::R_AARCH64_NONE; + } + return ELF::R_AARCH64_AUTH_ADR_GOT_PAGE; + } if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) return R_CLS(TLSIE_ADR_GOTTPREL_PAGE21); if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) @@ -240,6 +249,15 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(TLSLE_ADD_TPREL_LO12); if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) return R_CLS(TLSDESC_ADD_LO12); + if (RefKind == 
AArch64MCExpr::VK_GOT_AUTH_LO12 && IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "ILP32 ADD AUTH relocation not supported " + "(LP64 eqv: AUTH_GOT_ADD_LO12_NC)"); + return ELF::R_AARCH64_NONE; + } + return ELF::R_AARCH64_AUTH_GOT_ADD_LO12_NC; + } if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(ADD_ABS_LO12_NC); @@ -332,17 +350,23 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(LDST64_ABS_LO12_NC); - if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) { + if ((SymLoc == AArch64MCExpr::VK_GOT || + SymLoc == AArch64MCExpr::VK_GOT_AUTH) && + IsNC) { AArch64MCExpr::VariantKind AddressLoc = AArch64MCExpr::getAddressFrag(RefKind); + bool IsAuth = (SymLoc == AArch64MCExpr::VK_GOT_AUTH); if (!IsILP32) { if (AddressLoc == AArch64MCExpr::VK_LO15) return ELF::R_AARCH64_LD64_GOTPAGE_LO15; - return ELF::R_AARCH64_LD64_GOT_LO12_NC; + return (IsAuth ? ELF::R_AARCH64_AUTH_LD64_GOT_LO12_NC + : ELF::R_AARCH64_LD64_GOT_LO12_NC); } - Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " - "relocation not supported (LP64 eqv: " - "LD64_GOT_LO12_NC)"); + Ctx.reportError(Fixup.getLoc(), + Twine("ILP32 64-bit load/store " + "relocation not supported (LP64 eqv: ") + + (IsAuth ? 
"AUTH_GOT_LO12_NC" : "LD64_GOT_LO12_NC") + + Twine(')')); return ELF::R_AARCH64_NONE; } if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index fb8eb9f47da17..3430b9002894f 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -30,6 +30,7 @@ const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind, } StringRef AArch64MCExpr::getVariantKindName() const { + // clang-format off switch (static_cast(getKind())) { case VK_CALL: return ""; case VK_LO12: return ":lo12:"; @@ -82,9 +83,13 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_TLSDESC_PAGE: return ":tlsdesc:"; case VK_SECREL_LO12: return ":secrel_lo12:"; case VK_SECREL_HI12: return ":secrel_hi12:"; + case VK_GOT_AUTH: return ":got_auth:"; + case VK_GOT_AUTH_PAGE: return ":got_auth:"; + case VK_GOT_AUTH_LO12: return ":got_auth_lo12:"; default: llvm_unreachable("Invalid ELF symbol kind"); } + // clang-format on } void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index cf3a90f95a2c1..699992782f67b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -24,6 +24,7 @@ namespace llvm { class AArch64MCExpr : public MCTargetExpr { public: enum VariantKind { + // clang-format off // Symbol locations specifying (roughly speaking) what calculation should be // performed to construct the final address for the relocated // symbol. E.g. direct, via the GOT, ... 
@@ -38,6 +39,7 @@ class AArch64MCExpr : public MCTargetExpr { VK_SECREL = 0x009, VK_AUTH = 0x00a, VK_AUTHADDR = 0x00b, + VK_GOT_AUTH = 0x00c, VK_SymLocBits = 0x00f, // Variants specifying which part of the final address calculation is @@ -88,6 +90,8 @@ class AArch64MCExpr : public MCTargetExpr { VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC, + VK_GOT_AUTH_LO12 = VK_GOT_AUTH | VK_PAGEOFF | VK_NC, + VK_GOT_AUTH_PAGE = VK_GOT_AUTH | VK_PAGE, VK_DTPREL_G2 = VK_DTPREL | VK_G2, VK_DTPREL_G1 = VK_DTPREL | VK_G1, VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, @@ -114,6 +118,7 @@ class AArch64MCExpr : public MCTargetExpr { VK_SECREL_HI12 = VK_SECREL | VK_HI12, VK_INVALID = 0xfff + // clang-format on }; private: diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index e7c90b0ed14e0..54e64e3708223 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -390,6 +390,35 @@ multiclass sme_int_outer_product_i64 opc, string mnemonic, def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } +class sme_int_sparse_outer_product_i32 opc, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> + : I<(outs TileOp32:$ZAda), + (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm, ZK:$Zk, VectorIndexS32b:$imm), + mnemonic, "\t$ZAda, $Zn, $Zm, $Zk$imm", + "", []>, + Sched<[]> { + bits<2> ZAda; + bits<4> Zn; + bits<5> Zm; + bits<3> Zk; + bits<2> imm; + let Inst{31-25} = 0b1000000; + let Inst{24} = opc{4}; + let Inst{23-22} = 0b01; + let Inst{21} = opc{3}; + let Inst{20-16} = Zm; + let Inst{15} = opc{2}; + let Inst{14} = 0b0; + let Inst{13} = opc{1}; + let Inst{12-10} = Zk; + let Inst{9-6} = Zn; + let Inst{5-4} = imm; + let Inst{3} = opc{0}; + let Inst{2} = 0b0; + let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + class sme_outer_product_widening_inst opc, ZPRRegOp zpr_ty, string mnemonic> : I<(outs TileOp32:$ZAda), (ins 
TileOp32:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), @@ -433,6 +462,116 @@ multiclass sme_f16_outer_product opc, string mnemonic, SDPatternOperator def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } +class sme_quarter_outer_product_i64 zn_u_pair, bits<2> zm_u_pair, bit subtr, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> + : I<(outs TileOp64:$ZAda), + (ins TileOp64:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<3> ZAda; + bits<3> Zn; + bits<3> Zm; + let Inst{31-25} = 0b1010000; + let Inst{24} = zn_u_pair{1}; // u0 + let Inst{23-22} = 0b11; + let Inst{21} = zm_u_pair{1}; // u1 + let Inst{20} = zm_u_pair{0}; // M + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0000000; + let Inst{9} = zn_u_pair{0}; // N + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = subtr; + let Inst{3} = 0b1; + let Inst{2-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +class sme_quarter_outer_product_i8_i32 zn_u_pair, bits<2> zm_u_pair, bit subtr, RegisterOperand zn_ty, RegisterOperand zm_ty, string mnemonic> + : I<(outs TileOp32:$ZAda), + (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<2> ZAda; + bits<3> Zn; + bits<3> Zm; + let Inst{31-25} = 0b1000000; + let Inst{24} = zn_u_pair{1}; // u0 + let Inst{23-22} = 0b00; + let Inst{21} = zm_u_pair{1}; // u1 + let Inst{20} = zm_u_pair{0}; // M + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0100000; + let Inst{9} = zn_u_pair{0}; // N + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = subtr; + let Inst{3-2} = 0b00; + let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +class sme_quarter_outer_product_i16_i32 + : I<(outs TileOp32:$ZAda), + (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<2> ZAda; + bits<3> Zn; + bits<3> Zm; + let Inst{31-25} = 0b1000000; + let Inst{24} = u0; + let Inst{23-21} = 0b000; + let Inst{20} = M; + let 
Inst{19-17} = Zm; + let Inst{16-10} = 0b0100000; + let Inst{9} = N; + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = subtr; + let Inst{3-2} = 0b10; + let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +multiclass sme_quarter_outer_product_i8_i32{ + def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr, + ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>; + def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr, + ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>; + def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr, + ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>; + def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr, + ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>; +} + +multiclass sme_quarter_outer_product_i16_i32{ + def _MZZ_HToS : sme_quarter_outer_product_i16_i32; + def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32; + def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32; + def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32; +} + +multiclass sme_quarter_outer_product_i64{ + def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr, + ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>; + def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr, + ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>; + def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr, + ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>; + def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr, + ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>; +} + //===----------------------------------------------------------------------===// // SME Add Vector to Tile //===----------------------------------------------------------------------===// @@ -5417,3 +5556,151 @@ multiclass sme2_fmop4a_fp8_fp32_4way { // Multiple vectors def _M2Z2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>; } + +class sme2_bf16_fp16_quarter_tile_outer_product + : 
I<(outs TileOp16:$ZAda), + (ins TileOp16:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, Sched<[]> { + bit ZAda; + bits<3> Zn; + bits<3> Zm; + + let Inst{31-21} = 0b10000001001; + let Inst{20} = M; + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0000000; + let Inst{9} = N; + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = S; + let Inst{3-1} = 0b100; + let Inst{0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +multiclass sme2_bfmop4as_non_widening { + // Single vectors + def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + + // Multiple and single vectors + def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + + // Single and multiple vectors + def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + + // Multiple vectors + def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; +} + +class sme2_fp32_quarter_tile_outer_product + : I<(outs TileOp32:$ZAda), + (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, Sched<[]> { + bits<2> ZAda; + bits<3> Zn; + bits<3> Zm; + + let Inst{31-21} = 0b10000000000; + let Inst{20} = M; + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0000000; + let Inst{9} = N; + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = S; + let Inst{3-2} = 0b00; + let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +multiclass sme2_fmop4as_fp32_non_widening { + // Single vectors + def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>; + + // Multiple and single vectors + def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>; + + // Single and multiple vectors + def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>; + + // Multiple 
vectors + def _M2Z2Z_S : sme2_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZZ_s_mul_r_Hi>; +} + +class sme2_fp64_quarter_tile_outer_product + : I<(outs TileOp64:$ZAda), + (ins TileOp64:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, Sched<[]> { + bits<3> ZAda; + bits<3> Zn; + bits<3> Zm; + + let Inst{31-21} = 0b10000000110; + let Inst{20} = M; + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0000000; + let Inst{9} = N; + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = S; + let Inst{3} = 0b1; + let Inst{2-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +multiclass sme2_fmop4as_fp64_non_widening { + // Single vectors + def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>; + + // Multiple and single vectors + def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>; + + // Single and multiple vectors + def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>; + + // Multiple vectors + def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>; +} + +class sme2_fp16_fp32_quarter_tile_outer_product + : I<(outs TileOp32:$ZAda), + (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm), + mnemonic, "\t$ZAda, $Zn, $Zm", + "", []>, Sched<[]> { + bits<2> ZAda; + bits<3> Zn; + bits<3> Zm; + + let Inst{31-21} = 0b10000001001; + let Inst{20} = M; + let Inst{19-17} = Zm; + let Inst{16-10} = 0b0000000; + let Inst{9} = N; + let Inst{8-6} = Zn; + let Inst{5} = 0; + let Inst{4} = S; + let Inst{3-2} = 0b00; + let Inst{1-0} = ZAda; + + let Constraints = "$ZAda = $_ZAda"; +} + +multiclass sme2_fmop4as_fp16_fp32_widening { + // Single vectors + def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + + // Multiple and single vectors + def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, 
ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + + // Single and multiple vectors + def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + + // Multiple vectors + def _M2Z2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; +} diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 552d5b9b23a7e..5cfcc01afd20f 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7429,6 +7429,45 @@ multiclass sve_int_perm_rev_revw { def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } +class sve_int_perm_rev_z sz, bits<4> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn), + asm, "\t$Zd, $Pg/z, $Zn", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<3> Pg; + bits<5> Zn; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz; + let Inst{21-20} = 0b10; + let Inst{19-16} = opc; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let hasSideEffects = 0; +} + +multiclass sve_int_perm_rev_rbit_z { + def _B : sve_int_perm_rev_z<0b00, 0b0111, asm, ZPR8>; + def _H : sve_int_perm_rev_z<0b01, 0b0111, asm, ZPR16>; + def _S : sve_int_perm_rev_z<0b10, 0b0111, asm, ZPR32>; + def _D : sve_int_perm_rev_z<0b11, 0b0111, asm, ZPR64>; +} + +multiclass sve_int_perm_rev_revb_z { + def _H : sve_int_perm_rev_z<0b01, 0b0100, asm, ZPR16>; + def _S : sve_int_perm_rev_z<0b10, 0b0100, asm, ZPR32>; + def _D : sve_int_perm_rev_z<0b11, 0b0100, asm, ZPR64>; +} + +multiclass sve_int_perm_rev_revh_z { + def _S : sve_int_perm_rev_z<0b10, 0b0101, asm, ZPR32>; + def _D : sve_int_perm_rev_z<0b11, 0b0101, asm, ZPR64>; +} + class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty, RegisterClass srcRegType> : I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 6a69b9d2bfc71..2ae34636005ea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -41,7 +41,7 @@ static cl::opt IndirectCallSpecializationThreshold( #define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS, enum ImplicitArgumentPositions { - #include "AMDGPUAttributes.def" +#include "AMDGPUAttributes.def" LAST_ARG_POS }; @@ -49,14 +49,14 @@ enum ImplicitArgumentPositions { enum ImplicitArgumentMask { NOT_IMPLICIT_INPUT = 0, - #include "AMDGPUAttributes.def" +#include "AMDGPUAttributes.def" ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, -static constexpr std::pair ImplicitAttrs[] = { - #include "AMDGPUAttributes.def" +static constexpr std::pair + ImplicitAttrs[] = { +#include "AMDGPUAttributes.def" }; // We do not need to note the x workitem or workgroup id because they are always @@ -107,12 +107,12 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, // Under V5, we need implicitarg_ptr + offsets to access private_base or // shared_base. For pre-V5, however, need to access them through queue_ptr + // offsets. - return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR : - QUEUE_PTR; + return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR + : QUEUE_PTR; case Intrinsic::trap: if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4. - return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT : - QUEUE_PTR; + return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT + : QUEUE_PTR; NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: @@ -180,9 +180,7 @@ class AMDGPUInformationCache : public InformationCache { } /// Get code object version. 
- unsigned getCodeObjectVersion() const { - return CodeObjectVersion; - } + unsigned getCodeObjectVersion() const { return CodeObjectVersion; } /// Get the effective value of "amdgpu-waves-per-eu" for the function, /// accounting for the interaction with the passed value to use for @@ -707,8 +705,7 @@ struct AAAMDSizeRangeAttribute /// See AbstractAttribute::trackStatistics() void trackStatistics() const override {} - template - ChangeStatus updateImplImpl(Attributor &A) { + template ChangeStatus updateImplImpl(Attributor &A) { ChangeStatus Change = ChangeStatus::UNCHANGED; auto CheckCallSite = [&](AbstractCallSite CS) { @@ -728,7 +725,9 @@ struct AAAMDSizeRangeAttribute }; bool AllCallSitesKnown = true; - if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + if (!A.checkForAllCallSites(CheckCallSite, *this, + /*RequireAllCallSites=*/true, + AllCallSitesKnown)) return indicatePessimisticFixpoint(); return Change; @@ -747,7 +746,7 @@ struct AAAMDSizeRangeAttribute OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; return A.manifestAttrs(getIRPosition(), {Attribute::get(Ctx, AttrName, OS.str())}, - /* ForceReplace */ true); + /*ForceReplace=*/true); } const std::string getAsStr(Attributor *) const override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 351e9f25e29cf..ab62e530a18d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -230,13 +230,6 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { return AddrReg.getReg(0); } - void assignValueToReg(Register ValVReg, Register PhysReg, - const CCValAssign &VA) override { - MIB.addUse(PhysReg, RegState::Implicit); - Register ExtReg = extendRegisterMin32(*this, ValVReg, VA); - MIRBuilder.buildCopy(PhysReg, ExtReg); - } - void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, const MachinePointerInfo &MPO, const 
CCValAssign &VA) override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 278d3536add91..d348f489d95dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -207,6 +207,8 @@ def : GINodeEquiv { def : GINodeEquiv { bit CheckMMOIsAtomic = 1; + let IfSignExtend = G_SEXTLOAD; + let IfZeroExtend = G_ZEXTLOAD; } def : GINodeEquiv { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 09987a6504b9d..671070c70f0c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -504,23 +504,36 @@ def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextloadi16 node:$ptr)> { def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> { let IsAtomic = 1; - let MemoryVT = i8; } def atomic_load_16_#as : PatFrag<(ops node:$ptr), (atomic_load_16 node:$ptr)> { let IsAtomic = 1; - let MemoryVT = i16; } def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> { let IsAtomic = 1; - let MemoryVT = i32; } def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> { let IsAtomic = 1; - let MemoryVT = i64; } + +def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> { + let IsAtomic = 1; +} + +def atomic_load_sext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_sext_8 node:$ptr)> { + let IsAtomic = 1; +} + +def atomic_load_zext_16_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_16 node:$ptr)> { + let IsAtomic = 1; +} + +def atomic_load_sext_16_#as : PatFrag<(ops node:$ptr), (atomic_load_sext_16 node:$ptr)> { + let IsAtomic = 1; +} + } // End let AddressSpaces } // End foreach as diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b35f9faf024bd..d7126132356d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5494,6 +5494,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, .addImm(Src5) .getReg(0); } + case Intrinsic::amdgcn_mov_dpp8: + return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0); default: llvm_unreachable("unhandled lane op"); } @@ -7529,6 +7531,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 6bdff9862e55a..79d6a825f60b0 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -983,15 +983,20 @@ defm BUFFER_LOAD_LDS_U16 : MUBUF_Pseudo_Loads_LDSOpc < >; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, atomic_load_sext_8_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, atomic_load_sext_16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>; defm : 
MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>; +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, atomic_load_sext_16_global>; foreach vt = Reg32Types.types in { defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>; @@ -2541,11 +2546,9 @@ multiclass MUBUF_Real_AllAddr_gfx11_Impl op, bit hasTFE, defm _TFE : MUBUF_Real_AllAddr_gfx11_Impl2; } -multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Impl op, bit hasTFE, - string real_name> { +multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Impl op, string real_name> { defm NAME : MUBUF_Real_AllAddr_gfx11_gfx12_Impl2; - if hasTFE then - defm _TFE : MUBUF_Real_AllAddr_gfx11_gfx12_Impl2; + defm _TFE : MUBUF_Real_AllAddr_gfx11_gfx12_Impl2; } // Non-renamed, non-atomic gfx11/gfx12 mubuf instructions. @@ -2554,7 +2557,7 @@ multiclass MUBUF_Real_AllAddr_gfx11 op, bit hasTFE = 1> : multiclass MUBUF_Real_AllAddr_gfx11_gfx12 op, string real_name = get_BUF_ps.Mnemonic> : - MUBUF_Real_AllAddr_gfx11_gfx12_Impl { + MUBUF_Real_AllAddr_gfx11_gfx12_Impl { defvar ps = get_BUF_ps; if !ne(ps.Mnemonic, real_name) then def : Mnem_gfx11_gfx12; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index e9283fde85a48..7724821bbd7c3 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -795,12 +795,19 @@ defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { +// TODO: Atomic loads def : DSReadPat_D16; def : DSReadPat_D16; def : DSReadPat_D16; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 
a9ab0c5a453e8..db74372e9db45 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1355,11 +1355,17 @@ let OtherPredicates = [HasFlatAddressSpace] in { def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -1367,6 +1373,7 @@ def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; @@ -1456,6 +1463,7 @@ def : FlatStorePat ; } let OtherPredicates = [D16PreservesUnusedBits] in { +// TODO: Handle atomic loads def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; def : FlatLoadPat_D16 ; @@ -1477,8 +1485,14 @@ let OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -1488,6 +1502,8 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; foreach vt = Reg32Types.types in { @@ -1525,6 +1541,7 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATLoadPats_D16 ; defm : GlobalFLATLoadPats_D16 ; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 52ca38aca5c77..d66610ae0a160 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/IntrinsicInst.h" 
#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/ModRef.h" @@ -3855,10 +3856,14 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned ArgIdx = 0; for (auto [Reg, Val] : RegsToPass) { - if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() && - TRI->isSGPRPhysReg(Reg)) { - // Speculatively insert a readfirstlane in case this is a uniform value in - // a VGPR. + if (ArgIdx++ >= NumSpecialInputs && + (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) { + // For chain calls, the inreg arguments are required to be + // uniform. Speculatively Insert a readfirstlane in case we cannot prove + // they are uniform. + // + // For other calls, if an inreg arguments is known to be uniform, + // speculatively insert a readfirstlane in case it is in a VGPR. // // FIXME: We need to execute this in a waterfall loop if it is a divergent // value, so let that continue to produce invalid code. @@ -3893,9 +3898,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); } else { if (IsTailCall) { - assert(!Callee->isDivergent() && - "cannot tail call a divergent call target"); - // isEligibleForTailCallOptimization considered whether the call target is // divergent, but we may still end up with a uniform value in a VGPR. // Insert a readfirstlane just in case. 
@@ -6181,6 +6183,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, case Intrinsic::amdgcn_readlane: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: Operands.push_back(Src1); [[fallthrough]]; case Intrinsic::amdgcn_readfirstlane: @@ -6207,7 +6210,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src0 = N->getOperand(1); SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); @@ -8833,6 +8836,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_set_inactive: case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -16307,12 +16311,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { : TargetLowering::AtomicExpansionKind::CmpXChg; } +/// Return if a flat address space atomicrmw can access private memory. 
+static bool flatInstrMayAccessPrivate(const Instruction *I) { + const MDNode *NoaliasAddrSpaceMD = + I->getMetadata(LLVMContext::MD_noalias_addrspace); + if (!NoaliasAddrSpaceMD) + return true; + + for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; + ++I) { + auto *Low = mdconst::extract( + NoaliasAddrSpaceMD->getOperand(2 * I + 0)); + auto *High = mdconst::extract( + NoaliasAddrSpaceMD->getOperand(2 * I + 1)); + + if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) && + High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS)) + return true; + } + + return false; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) return AtomicExpansionKind::NotAtomic; + // 64-bit flat atomics that dynamically reside in private memory will silently + // be dropped. + // + // Note that we will emit a new copy of the original atomic in the expansion, + // which will be incrementally relegalized. 
+ const DataLayout &DL = RMW->getFunction()->getDataLayout(); + if (AS == AMDGPUAS::FLAT_ADDRESS && + DL.getTypeSizeInBits(RMW->getType()) == 64 && + flatInstrMayAccessPrivate(RMW)) + return AtomicExpansionKind::Expand; + auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { OptimizationRemarkEmitter ORE(RMW->getFunction()); ORE.emit([=]() { @@ -16713,20 +16750,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor) { - // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 - assert(cast(AI->getValOperand())->isNullValue() && - "this cannot be replaced with add"); - AI->setOperation(AtomicRMWInst::Add); - return; + if (auto *ConstVal = dyn_cast(AI->getValOperand()); + ConstVal && ConstVal->isNullValue()) { + // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 + AI->setOperation(AtomicRMWInst::Add); + + // TODO: Turn the below private handling into a no-op for idempotent + // cases. + } } - assert(Subtarget->hasAtomicFaddInsts() && - "target should have atomic fadd instructions"); - assert(AI->getType()->isFloatTy() && - AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS && - "generic atomicrmw expansion only supports FP32 operand in flat " - "address space"); - assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now"); + // The non-flat expansions should only perform the de-canonicalization of + // identity values. + if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS) + return; + + // FullFlatEmulation is true if we need to issue the private, shared, and + // global cases. + // + // If this is false, we are only dealing with the flat-targeting-private case, + // where we only insert a check for private and still use the flat instruction + // for global and shared. + + // TODO: Avoid the private check for the fadd case depending on + // noalias.addrspace. 
+ + bool FullFlatEmulation = Op == AtomicRMWInst::FAdd && + Subtarget->hasAtomicFaddInsts() && + AI->getType()->isFloatTy(); // Given: atomicrmw fadd ptr %addr, float %val ordering // @@ -16766,6 +16817,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { // // atomicrmw.end: // [...] + // + // + // For 64-bit atomics which may reside in private memory, we perform a simpler + // version that only inserts the private check, and uses the flat operation. IRBuilder<> Builder(AI); LLVMContext &Ctx = Builder.getContext(); @@ -16777,9 +16832,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Function *F = BB->getParent(); BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); - BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); - BasicBlock *CheckPrivateBB = - BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); + BasicBlock *SharedBB = nullptr; + + BasicBlock *CheckPrivateBB = BB; + if (FullFlatEmulation) { + SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); + CheckPrivateBB = + BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); + } + BasicBlock *PrivateBB = BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB); BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB); @@ -16792,23 +16853,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, - {Addr}, nullptr, "is.shared"); - Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); - Builder.SetInsertPoint(SharedBB); - Value *CastToLocal = Builder.CreateAddrSpaceCast( - Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); + Value *LoadedShared = nullptr; + if (FullFlatEmulation) { + CallInst *IsShared = Builder.CreateIntrinsic( + Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, 
"is.shared"); + Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); + Builder.SetInsertPoint(SharedBB); + Value *CastToLocal = Builder.CreateAddrSpaceCast( + Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); - Instruction *Clone = AI->clone(); - Clone->insertInto(SharedBB, SharedBB->end()); - Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) - .set(CastToLocal); - Instruction *LoadedShared = Clone; + Instruction *Clone = AI->clone(); + Clone->insertInto(SharedBB, SharedBB->end()); + Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) + .set(CastToLocal); + LoadedShared = Clone; - Builder.CreateBr(PhiBB); + Builder.CreateBr(PhiBB); + Builder.SetInsertPoint(CheckPrivateBB); + } - Builder.SetInsertPoint(CheckPrivateBB); CallInst *IsPrivate = Builder.CreateIntrinsic( Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private"); Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); @@ -16825,15 +16889,32 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.CreateBr(PhiBB); Builder.SetInsertPoint(GlobalBB); - Value *CastToGlobal = Builder.CreateAddrSpaceCast( - Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS)); - Value *LoadedGlobal = AI; - AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal); + // Continue using a flat instruction if we only emitted the check for private. + Instruction *LoadedGlobal = AI; + if (FullFlatEmulation) { + Value *CastToGlobal = Builder.CreateAddrSpaceCast( + Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS)); + AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) + .set(CastToGlobal); + } AI->removeFromParent(); AI->insertInto(GlobalBB, GlobalBB->end()); + // The new atomicrmw may go through another round of legalization later. + if (!FullFlatEmulation) { + // We inserted the runtime check already, make sure we do not try to + // re-expand this. + // TODO: Should union with any existing metadata. 
+ MDBuilder MDB(F->getContext()); + MDNode *RangeNotPrivate = + MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS), + APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1)); + LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace, + RangeNotPrivate); + } + Builder.CreateBr(PhiBB); Builder.SetInsertPoint(PhiBB); @@ -16841,7 +16922,8 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { if (ReturnValueIsUsed) { PHINode *Loaded = Builder.CreatePHI(ValTy, 3); AI->replaceAllUsesWith(Loaded); - Loaded->addIncoming(LoadedShared, SharedBB); + if (FullFlatEmulation) + Loaded->addIncoming(LoadedShared, SharedBB); Loaded->addIncoming(LoadedPrivate, PrivateBB); Loaded->addIncoming(LoadedGlobal, GlobalBB); Loaded->takeName(AI); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 42a1ffb8a26d4..fce50b741bb63 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -348,6 +348,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsNonExtLoad = 1; } +def atomic_load_zext_glue : + PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? + let IsZeroExtLoad = true; +} + +def atomic_load_sext_glue : + PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? 
+ let IsSignExtLoad = true; +} + def atomic_load_8_glue : PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> { let IsAtomic = 1; @@ -372,6 +384,30 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr), let MemoryVT = i64; } +def atomic_load_zext_8_glue : PatFrag<(ops node:$ptr), + (atomic_load_zext_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i8; +} + +def atomic_load_sext_8_glue : PatFrag<(ops node:$ptr), + (atomic_load_sext_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i8; +} + +def atomic_load_zext_16_glue : PatFrag<(ops node:$ptr), + (atomic_load_zext_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i16; +} + +def atomic_load_sext_16_glue : PatFrag<(ops node:$ptr), + (atomic_load_sext_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i16; +} + def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> { let IsLoad = 1; let IsAnyExtLoad = 1; @@ -453,6 +489,15 @@ def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr), (atomic_load_32_glue node:$ptr)>; def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr), (atomic_load_64_glue node:$ptr)>; + +def atomic_load_zext_8_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_zext_8_glue node:$ptr)>; +def atomic_load_sext_8_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_sext_8_glue node:$ptr)>; +def atomic_load_zext_16_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_zext_16_glue node:$ptr)>; +def atomic_load_sext_16_local_m0 : PatFrag<(ops node:$ptr), + (atomic_load_sext_16_glue node:$ptr)>; } // End let AddressSpaces = LoadAddress_local.AddrSpaces diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index faa0b6d6c3f50..c8a46217190a1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3553,19 +3553,6 @@ def : AMDGPUPat < (V_BFE_U32_e64 $src, (i32 0), $width) >; -// x << (bitwidth - y) >> (bitwidth - y) -def : AMDGPUPat < - (DivergentBinFrag (shl_oneuse i32:$src, 
(sub 32, i32:$width)), - (sub 32, i32:$width)), - (V_BFE_U32_e64 $src, (i32 0), $width) ->; - -def : AMDGPUPat < - (DivergentBinFrag (shl_oneuse i32:$src, (sub 32, i32:$width)), - (sub 32, i32:$width)), - (V_BFE_I32_e64 $src, (i32 0), $width) ->; - // SHA-256 Ma patterns // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 701aeda82c91e..6b50ed9593176 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1508,12 +1508,14 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; -class MovDPP8Pattern : GCNPat < - (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)), +class MovDPP8Pattern : GCNPat < + (vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)), (Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> { let OtherPredicates = [Pred]; } -def : MovDPP8Pattern; -def : MovDPP8Pattern; -def : MovDPP8Pattern; +foreach vt = Reg32Types.types in { + def : MovDPP8Pattern; + def : MovDPP8Pattern; + def : MovDPP8Pattern; +} diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 5be9d73022a6e..8e79a0a344067 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1426,6 +1426,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( // Use ScratchRegs to store the fp regs std::vector> ClearedFPRegs; std::vector NonclearedFPRegs; + bool ReturnsFPReg = false; for (const MachineOperand &Op : MBBI->operands()) { if (Op.isReg() && Op.isUse()) { Register Reg = Op.getReg(); @@ -1460,14 +1461,51 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( NonclearedFPRegs.push_back(Reg); } } + } else if (Op.isReg() && Op.isDef()) { + Register Reg = Op.getReg(); + if (ARM::SPRRegClass.contains(Reg) || 
ARM::DPRRegClass.contains(Reg) || + ARM::QPRRegClass.contains(Reg)) + ReturnsFPReg = true; } } - bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty()); + bool PassesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty()); - if (passesFPReg) + if (PassesFPReg || ReturnsFPReg) assert(STI->hasFPRegs() && "Subtarget needs fpregs"); + // CVE-2024-7883 + // + // The VLLDM/VLSTM instructions set up lazy state preservation, but they + // execute as NOPs if the FP register file is not considered to contain + // secure data, represented by the CONTROL_S.SFPA bit. This means that the + // state of CONTROL_S.SFPA must be the same when these two instructions are + // executed. That might not be the case if we haven't used any FP + // instructions before the VLSTM, so CONTROL_S.SFPA is clear, but do have one + // before the VLLDM, which sets it.. + // + // If we can't prove that SFPA will be the same for the VLSTM and VLLDM, we + // execute a "vmov s0, s0" instruction before the VLSTM to ensure that + // CONTROL_S.SFPA is set for both. + // + // That can only happen for callees which take no FP arguments (or we'd have + // inserted a VMOV above) and which return values in FP regs (so that we need + // to use a VMOV to back-up the return value before the VLLDM). It also can't + // happen if the call is dominated by other existing floating-point + // instructions, but we don't currently check for that case. + // + // These conditions mean that we only emit this instruction when using the + // hard-float ABI, which means we can assume that FP instructions are + // available, and don't need to make it conditional like we do for the + // CVE-2021-35465 workaround. 
+ if (ReturnsFPReg && !PassesFPReg) { + bool S0Dead = !LiveRegs.contains(ARM::S0); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVS)) + .addReg(ARM::S0, RegState::Define | getDeadRegState(S0Dead)) + .addReg(ARM::S0, getUndefRegState(S0Dead)) + .add(predOps(ARMCC::AL)); + } + // Lazy store all fp registers to the stack. // This executes as NOP in the absence of floating-point support. MachineInstrBuilder VLSTM = @@ -1528,7 +1566,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8( } // restore FPSCR from stack and clear bits 0-4, 7, 28-31 // The other bits are program global according to the AAPCS - if (passesFPReg) { + if (PassesFPReg) { BuildMI(MBB, MBBI, DL, TII->get(ARM::tLDRspi), SpareReg) .addReg(ARM::SP) .addImm(0x10) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a98b7a8420927..5a72ef734e81d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -6051,7 +6051,7 @@ static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, DAG.getConstant((1 << BW) - 1, DL, VT)); if (IsSigned) Max = DAG.getNode(ISD::SMAX, DL, VT, Max, - DAG.getConstant(-(1 << BW), DL, VT)); + DAG.getSignedConstant(-(1 << BW), DL, VT)); return Max; } @@ -7951,6 +7951,8 @@ static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { case ISD::MUL: case ISD::SADDSAT: case ISD::UADDSAT: + case ISD::AVGFLOORS: + case ISD::AVGFLOORU: return true; case ISD::SUB: case ISD::SSUBSAT: diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 04d5d00eef10e..8c8403ac58b08 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2222,64 +2222,6 @@ defm MVE_VRHADDu8 : MVE_VRHADD; defm MVE_VRHADDu16 : MVE_VRHADD; defm MVE_VRHADDu32 : MVE_VRHADD; -// Rounding Halving Add perform the arithemtic operation with an extra bit of -// precision, before performing the shift, to void clipping errors. 
We're not -// modelling that here with these patterns, but we're using no wrap forms of -// add to ensure that the extra bit of information is not needed for the -// arithmetic or the rounding. -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvmovImm (i32 3585)))), - (i32 1))), - (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), - (v8i16 (ARMvmovImm (i32 2049)))), - (i32 1))), - (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvmovImm (i32 1)))), - (i32 1))), - (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvmovImm (i32 3585)))), - (i32 1))), - (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), - (v8i16 (ARMvmovImm (i32 2049)))), - (i32 1))), - (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvmovImm (i32 1)))), - (i32 1))), - (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; - - def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), - (v8i16 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 
MQPR:$Qn)), - (v8i16 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvdup (i32 1)))), - (i32 1))), - (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; -} - - class MVE_VHADDSUB size, list pattern=[]> : MVE_int { @@ -2303,8 +2245,7 @@ class MVE_VHSUB_ size, : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; multiclass MVE_VHADD_m { + SDPatternOperator unpred_op, Intrinsic PredInt> { def "" : MVE_VHADD_; defvar Inst = !cast(NAME); defm : MVE_TwoOpPattern(NAME)>; @@ -2313,26 +2254,18 @@ multiclass MVE_VHADD_m; - - def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), - (Inst MQPR:$Qm, MQPR:$Qn)>; } } -multiclass MVE_VHADD - : MVE_VHADD_m; +multiclass MVE_VHADD + : MVE_VHADD_m; -// Halving add/sub perform the arithemtic operation with an extra bit of -// precision, before performing the shift, to void clipping errors. We're not -// modelling that here with these patterns, but we're using no wrap forms of -// add/sub to ensure that the extra bit of information is not needed. 
-defm MVE_VHADDs8 : MVE_VHADD; -defm MVE_VHADDs16 : MVE_VHADD; -defm MVE_VHADDs32 : MVE_VHADD; -defm MVE_VHADDu8 : MVE_VHADD; -defm MVE_VHADDu16 : MVE_VHADD; -defm MVE_VHADDu32 : MVE_VHADD; +defm MVE_VHADDs8 : MVE_VHADD; +defm MVE_VHADDs16 : MVE_VHADD; +defm MVE_VHADDs32 : MVE_VHADD; +defm MVE_VHADDu8 : MVE_VHADD; +defm MVE_VHADDu16 : MVE_VHADD; +defm MVE_VHADDu32 : MVE_VHADD; multiclass MVE_VHSUB_mgetTargetConstant(-((int)N->getZExtValue()), SDLoc(N), - MVT::i32); + return CurDAG->getSignedConstant(-((int)N->getZExtValue()), SDLoc(N), + MVT::i32, /*isTarget=*/true); }]>; // so_imm_notSext_XFORM - Return a so_imm value packed into the format diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 263ca50011aa7..1e8dc63ffa257 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -294,43 +294,6 @@ class Attributes attrs> { list op_attrs = attrs; } -class DXILConstant { - int value = value_; -} - -defset list BarrierModes = { - def BarrierMode_DeviceMemoryBarrier : DXILConstant<2>; - def BarrierMode_DeviceMemoryBarrierWithGroupSync : DXILConstant<3>; - def BarrierMode_GroupMemoryBarrier : DXILConstant<8>; - def BarrierMode_GroupMemoryBarrierWithGroupSync : DXILConstant<9>; - def BarrierMode_AllMemoryBarrier : DXILConstant<10>; - def BarrierMode_AllMemoryBarrierWithGroupSync : DXILConstant<11>; -} - -// Intrinsic arg selection -class Arg { - int index = -1; - DXILConstant value; - bit is_i8 = 0; - bit is_i32 = 0; -} -class ArgSelect : Arg { - let index = index_; -} -class ArgI32 : Arg { - let value = value_; - let is_i32 = 1; -} -class ArgI8 : Arg { - let value = value_; - let is_i8 = 1; -} - -class IntrinsicSelect args_> { - Intrinsic intrinsic = intrinsic_; - list args = args_; -} - // Abstraction DXIL Operation class DXILOp { // A short description of the operation @@ -345,9 +308,6 @@ class DXILOp { // LLVM Intrinsic DXIL Operation maps to Intrinsic LLVMIntrinsic = ?; - // Non-trivial LLVM Intrinsics DXIL 
Operation maps to - list intrinsic_selects = []; - // Result type of the op DXILOpParamType result; @@ -869,17 +829,3 @@ def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { let stages = [Stages]; let attributes = [Attributes]; } - -def Barrier : DXILOp<80, barrier> { - let Doc = "inserts a memory barrier in the shader"; - let intrinsic_selects = [ - IntrinsicSelect< - int_dx_group_memory_barrier_with_group_sync, - [ ArgI32 ]>, - ]; - - let arguments = [Int32Ty]; - let result = VoidTy; - let stages = [Stages]; - let attributes = [Attributes]; -} diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index b5cf1654181c6..8acc9c1efa08c 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -106,43 +106,17 @@ class OpLowerer { return false; } - struct ArgSelect { - enum class Type { - Index, - I8, - I32, - }; - Type Type = Type::Index; - int Value = -1; - }; - - [[nodiscard]] bool replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp, - ArrayRef ArgSelects) { + [[nodiscard]] + bool replaceFunctionWithOp(Function &F, dxil::OpCode DXILOp) { bool IsVectorArgExpansion = isVectorArgExpansion(F); return replaceFunction(F, [&](CallInst *CI) -> Error { - OpBuilder.getIRB().SetInsertPoint(CI); SmallVector Args; - if (ArgSelects.size()) { - for (const ArgSelect &A : ArgSelects) { - switch (A.Type) { - case ArgSelect::Type::Index: - Args.push_back(CI->getArgOperand(A.Value)); - break; - case ArgSelect::Type::I8: - Args.push_back(OpBuilder.getIRB().getInt8((uint8_t)A.Value)); - break; - case ArgSelect::Type::I32: - Args.push_back(OpBuilder.getIRB().getInt32(A.Value)); - break; - default: - llvm_unreachable("Invalid type of intrinsic arg select."); - } - } - } else if (IsVectorArgExpansion) { - Args = argVectorFlatten(CI, OpBuilder.getIRB()); - } else { + OpBuilder.getIRB().SetInsertPoint(CI); + if (IsVectorArgExpansion) { + SmallVector NewArgs = argVectorFlatten(CI, 
OpBuilder.getIRB()); + Args.append(NewArgs.begin(), NewArgs.end()); + } else Args.append(CI->arg_begin(), CI->arg_end()); - } Expected OpCall = OpBuilder.tryCreateOp(DXILOp, Args, CI->getName(), F.getReturnType()); @@ -609,10 +583,9 @@ class OpLowerer { switch (ID) { default: continue; -#define DXIL_OP_INTRINSIC(OpCode, Intrin, ...) \ +#define DXIL_OP_INTRINSIC(OpCode, Intrin) \ case Intrin: \ - HasErrors |= \ - replaceFunctionWithOp(F, OpCode, ArrayRef{__VA_ARGS__}); \ + HasErrors |= replaceFunctionWithOp(F, OpCode); \ break; #include "DXILOperation.inc" case Intrinsic::dx_handle_fromBinding: diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 54ebf86666abf..ecd00cd6d5d61 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -135,7 +135,9 @@ include "LoongArchInstrInfo.td" //===----------------------------------------------------------------------===// def : ProcessorModel<"generic-la32", NoSchedModel, [Feature32Bit]>; -def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit, FeatureUAL]>; +def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit, + FeatureUAL, + FeatureExtLSX]>; // Generic 64-bit processor with double-precision floating-point support. 
def : ProcessorModel<"loongarch64", NoSchedModel, [Feature64Bit, diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index e872ec443f87b..c88acdb9fb52f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -165,11 +165,9 @@ bool LoongArchPreRAExpandPseudo::expandMI( case LoongArch::PseudoLA_TLS_DESC_LARGE: return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoCALL: - case LoongArch::PseudoCALL_MEDIUM: case LoongArch::PseudoCALL_LARGE: return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); case LoongArch::PseudoTAIL: - case LoongArch::PseudoTAIL_MEDIUM: case LoongArch::PseudoTAIL_LARGE: return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); case LoongArch::PseudoBRIND: @@ -545,7 +543,7 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( switch (MF->getTarget().getCodeModel()) { default: - report_fatal_error("Unsupported code model"); + report_fatal_error("Unexpected code model"); break; case CodeModel::Small: { // CALL: @@ -556,31 +554,6 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); break; } - case CodeModel::Medium: { - // CALL: - // pcaddu18i $ra, %call36(func) - // jirl $ra, $ra, 0 - // TAIL: - // pcaddu18i $scratch, %call36(func) - // jirl $r0, $scratch, 0 - Opcode = - IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; - Register ScratchReg = - IsTailCall - ? 
MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : LoongArch::R1; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); - - CALL = - BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); - - if (Func.isSymbol()) - MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); - else - MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); - break; - } case CodeModel::Large: { // Emit the 5-insn large address load sequence, either directly or // indirectly in case of going through the GOT, then JIRL_TAIL or @@ -671,6 +644,10 @@ class LoongArchExpandPseudo : public MachineFunctionPass { MachineBasicBlock::iterator &NextMBBI); bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandFunctionCALL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + bool IsTailCall); }; char LoongArchExpandPseudo::ID = 0; @@ -705,6 +682,10 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case LoongArch::PseudoCopyCFR: return expandCopyCFR(MBB, MBBI, NextMBBI); + case LoongArch::PseudoCALL_MEDIUM: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); + case LoongArch::PseudoTAIL_MEDIUM: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); } return false; @@ -763,6 +744,54 @@ bool LoongArchExpandPseudo::expandCopyCFR( return true; } +bool LoongArchExpandPseudo::expandFunctionCALL( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Func = MI.getOperand(0); + MachineInstrBuilder CALL; + unsigned Opcode; + + switch (MF->getTarget().getCodeModel()) { + default: + report_fatal_error("Unexpected code model"); + break; + case 
CodeModel::Medium: { + // CALL: + // pcaddu18i $ra, %call36(func) + // jirl $ra, $ra, 0 + // TAIL: + // pcaddu18i $t8, %call36(func) + // jirl $r0, $t8, 0 + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); + + CALL = + BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); + + if (Func.isSymbol()) + MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); + else + MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); + break; + } + } + + // Transfer implicit operands. + CALL.copyImplicitOps(MI); + + // Transfer MI flags. + CALL.setMIFlags(MI.getFlags()); + + MI.eraseFromParent(); + return true; +} + } // end namespace INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e2c644a56c95b..6bee00d1ce382 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -269,6 +269,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::BSWAP, VT, Legal); for (MVT VT : {MVT::v4i32, MVT::v2i64}) { setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); @@ -317,6 +319,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) + setOperationAction(ISD::BSWAP, VT, Legal); for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) { 
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index a01f2ed3b5880..363cacf726c9c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -391,9 +391,6 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // // The following instruction patterns are prohibited from being reordered: // - // * pcaddu18 $ra, %call36(s) - // jirl $ra, $ra, 0 - // // * pcalau12i $a0, %pc_hi20(s) // addi.d $a1, $zero, %pc_lo12(s) // lu32i.d $a1, %pc64_lo20(s) @@ -413,10 +410,6 @@ bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // boundaries, and the instructions between them are guaranteed to be // ordered according to data dependencies. switch (MI.getOpcode()) { - case LoongArch::PCADDU18I: - if (MI.getOperand(1).getTargetFlags() == LoongArchII::MO_CALL36) - return true; - break; case LoongArch::PCALAU12I: { auto AddI = std::next(MII); if (AddI == MIE || AddI->getOpcode() != LoongArch::ADDI_D) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 671b8cc6ffe1b..ccdd516572823 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -1484,7 +1484,7 @@ def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; // Function call with 'Medium' code model. -let isCall = 1, Defs = [R1] in +let isCall = 1, Defs = [R1, R20], Size = 8 in def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>; let Predicates = [IsLA64] in { @@ -1533,7 +1533,8 @@ def : Pat<(loongarch_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; // Tail call with 'Medium' code model. 
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [R3], Defs = [R20], Size = 8 in def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>; let Predicates = [IsLA64] in { diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index d13cc9af135b5..3e39e2c10a617 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1444,6 +1444,12 @@ def : Pat<(xor (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_pow2 uimm5:$imm))), def : Pat<(xor (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_pow2 uimm6:$imm))), (XVBITREVI_D LASX256:$xj, uimm6:$imm)>; +// Vector bswaps +def : Pat<(bswap (v16i16 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b10110001)>; +def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>; +def : Pat<(bswap (v4i64 LASX256:$xj)), + (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>; + // XVFADD_{S/D} defm : PatXrXrF; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 86aa6dcfd8261..525d2802daa23 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1600,6 +1600,12 @@ def : Pat<(xor (v4i32 LSX128:$vj), (v4i32 (vsplat_uimm_pow2 uimm5:$imm))), def : Pat<(xor (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_pow2 uimm6:$imm))), (VBITREVI_D LSX128:$vj, uimm6:$imm)>; +// Vector bswaps +def : Pat<(bswap (v8i16 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b10110001)>; +def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>; +def : Pat<(bswap (v2i64 LSX128:$vj)), + (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>; + // VFADD_{S/D} defm : PatVrVrF; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp 
index f89c78e75d3ee..074a58cadb556 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -16,10 +16,10 @@ using namespace llvm; -void MipsMCAsmInfo::anchor() { } +void MipsELFMCAsmInfo::anchor() {} -MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple, - const MCTargetOptions &Options) { +MipsELFMCAsmInfo::MipsELFMCAsmInfo(const Triple &TheTriple, + const MCTargetOptions &Options) { IsLittleEndian = TheTriple.isLittleEndian(); MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TheTriple, "", Options); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index d8bfe58d24a83..b52ed12d3a0e7 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -18,12 +18,12 @@ namespace llvm { class Triple; -class MipsMCAsmInfo : public MCAsmInfoELF { +class MipsELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; public: - explicit MipsMCAsmInfo(const Triple &TheTriple, - const MCTargetOptions &Options); + explicit MipsELFMCAsmInfo(const Triple &TheTriple, + const MCTargetOptions &Options); }; } // namespace llvm diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ca95f67174da1..eff9ecf0d53d3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -84,7 +84,7 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT, static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT, const MCTargetOptions &Options) { - MCAsmInfo *MAI = new MipsMCAsmInfo(TT, Options); + MCAsmInfo *MAI = new MipsELFMCAsmInfo(TT, Options); unsigned SP = MRI.getDwarfRegNum(Mips::SP, true); MCCFIInstruction Inst = MCCFIInstruction::createDefCfaRegister(nullptr, SP); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp 
b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index f4af1d08dde5d..6b07999d862d9 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -730,70 +730,73 @@ printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) { } void MipsAsmPrinter::emitStartOfAsmFile(Module &M) { - MipsTargetStreamer &TS = getTargetStreamer(); - - // MipsTargetStreamer has an initialization order problem when emitting an - // object file directly (see MipsTargetELFStreamer for full details). Work - // around it by re-initializing the PIC state here. - TS.setPic(OutContext.getObjectFileInfo()->isPositionIndependent()); - - // Try to get target-features from the first function. - StringRef FS = TM.getTargetFeatureString(); - Module::iterator F = M.begin(); - if (FS.empty() && M.size() && F->hasFnAttribute("target-features")) - FS = F->getFnAttribute("target-features").getValueAsString(); - - // Compute MIPS architecture attributes based on the default subtarget - // that we'd have constructed. - // FIXME: For ifunc related functions we could iterate over and look - // for a feature string that doesn't match the default one. const Triple &TT = TM.getTargetTriple(); - StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); - const MipsTargetMachine &MTM = static_cast(TM); - const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, std::nullopt); - - bool IsABICalls = STI.isABICalls(); - const MipsABIInfo &ABI = MTM.getABI(); - if (IsABICalls) { - TS.emitDirectiveAbiCalls(); - // FIXME: This condition should be a lot more complicated that it is here. - // Ideally it should test for properties of the ABI and not the ABI - // itself. - // For the moment, I'm only correcting enough to make MIPS-IV work. 
- if (!isPositionIndependent() && STI.hasSym32()) - TS.emitDirectiveOptionPic0(); - } - // Tell the assembler which ABI we are using - std::string SectionName = std::string(".mdebug.") + getCurrentABIString(); - OutStreamer->switchSection( - OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0)); + if (TT.isOSBinFormatELF()) { + MipsTargetStreamer &TS = getTargetStreamer(); + + // MipsTargetStreamer has an initialization order problem when emitting an + // object file directly (see MipsTargetELFStreamer for full details). Work + // around it by re-initializing the PIC state here. + TS.setPic(OutContext.getObjectFileInfo()->isPositionIndependent()); + + // Try to get target-features from the first function. + StringRef FS = TM.getTargetFeatureString(); + Module::iterator F = M.begin(); + if (FS.empty() && M.size() && F->hasFnAttribute("target-features")) + FS = F->getFnAttribute("target-features").getValueAsString(); + + // Compute MIPS architecture attributes based on the default subtarget + // that we'd have constructed. + // FIXME: For ifunc related functions we could iterate over and look + // for a feature string that doesn't match the default one. + StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU()); + const MipsTargetMachine &MTM = static_cast(TM); + const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM, + std::nullopt); + + bool IsABICalls = STI.isABICalls(); + const MipsABIInfo &ABI = MTM.getABI(); + if (IsABICalls) { + TS.emitDirectiveAbiCalls(); + // FIXME: This condition should be a lot more complicated that it is here. + // Ideally it should test for properties of the ABI and not the ABI + // itself. + // For the moment, I'm only correcting enough to make MIPS-IV work. 
+ if (!isPositionIndependent() && STI.hasSym32()) + TS.emitDirectiveOptionPic0(); + } + + // Tell the assembler which ABI we are using + std::string SectionName = std::string(".mdebug.") + getCurrentABIString(); + OutStreamer->switchSection( + OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0)); - // NaN: At the moment we only support: - // 1. .nan legacy (default) - // 2. .nan 2008 - STI.isNaN2008() ? TS.emitDirectiveNaN2008() - : TS.emitDirectiveNaNLegacy(); + // NaN: At the moment we only support: + // 1. .nan legacy (default) + // 2. .nan 2008 + STI.isNaN2008() ? TS.emitDirectiveNaN2008() : TS.emitDirectiveNaNLegacy(); - // TODO: handle O64 ABI + // TODO: handle O64 ABI - TS.updateABIInfo(STI); + TS.updateABIInfo(STI); - // We should always emit a '.module fp=...' but binutils 2.24 does not accept - // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or - // -mfp64) and omit it otherwise. - if ((ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) || - STI.useSoftFloat()) - TS.emitDirectiveModuleFP(); + // We should always emit a '.module fp=...' but binutils 2.24 does not + // accept it. We therefore emit it when it contradicts the ABI defaults + // (-mfpxx or -mfp64) and omit it otherwise. + if ((ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) || + STI.useSoftFloat()) + TS.emitDirectiveModuleFP(); - // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not - // accept it. We therefore emit it when it contradicts the default or an - // option has changed the default (i.e. FPXX) and omit it otherwise. - if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX())) - TS.emitDirectiveModuleOddSPReg(); + // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not + // accept it. We therefore emit it when it contradicts the default or an + // option has changed the default (i.e. FPXX) and omit it otherwise. 
+ if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX())) + TS.emitDirectiveModuleOddSPReg(); - // Switch to the .text section. - OutStreamer->switchSection(getObjFileLowering().getTextSection()); + // Switch to the .text section. + OutStreamer->switchSection(getObjFileLowering().getTextSection()); + } } void MipsAsmPrinter::emitInlineAsmStart() const { diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 7802767e31c2f..c7dbcc80148ae 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -70,6 +70,10 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { initializeMipsDAGToDAGISelLegacyPass(*PR); } +static std::unique_ptr createTLOF(const Triple &TT) { + return std::make_unique(); +} + static std::string computeDataLayout(const Triple &TT, StringRef CPU, const TargetOptions &Options, bool isLittle) { @@ -128,7 +132,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT, CPU, FS, Options, getEffectiveRelocModel(JIT, RM), getEffectiveCodeModel(CM, CodeModel::Small), OL), - isLittle(isLittle), TLOF(std::make_unique()), + isLittle(isLittle), TLOF(createTLOF(getTargetTriple())), ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)), Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this, std::nullopt), diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index 815b600fe93a9..d06e2c00ec3f9 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -16,18 +16,10 @@ #ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H #define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H +#include "llvm/Support/NVPTXAddrSpace.h" namespace llvm { -enum AddressSpace { - ADDRESS_SPACE_GENERIC = 0, - 
ADDRESS_SPACE_GLOBAL = 1, - ADDRESS_SPACE_SHARED = 3, - ADDRESS_SPACE_CONST = 4, - ADDRESS_SPACE_LOCAL = 5, - - // NVVM Internal - ADDRESS_SPACE_PARAM = 101 -}; +using namespace NVPTXAS; namespace NVPTXII { enum { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a95cba586b8fc..379a4a7764707 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -409,6 +409,13 @@ VectorizePTXValueVTs(const SmallVectorImpl &ValueVTs, return VectorInfo; } +static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, + SDValue Value) { + if (Value->getValueType(0) == VT) + return Value; + return DAG.getNode(ISD::BITCAST, DL, VT, Value); +} + // NVPTXTargetLowering Constructor. NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) @@ -551,6 +558,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); + + // Custom conversions to/from v2i8. + setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); + // Only logical ops can be done on v4i8 directly, others must be done // elementwise. 
setOperationAction( @@ -1335,8 +1346,6 @@ NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) return TypeSplitVector; - if (Isv2x16VT(VT)) - return TypeLegal; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -2311,6 +2320,30 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { + // Handle bitcasting from v2i8 without hitting the default promotion + // strategy which goes through stack memory. + EVT FromVT = Op->getOperand(0)->getValueType(0); + if (FromVT != MVT::v2i8) { + return Op; + } + + // Pack vector elements into i16 and bitcast to final type + SDLoc DL(Op); + SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, + Op->getOperand(0), DAG.getIntPtrConstant(0, DL)); + SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, + Op->getOperand(0), DAG.getIntPtrConstant(1, DL)); + SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0); + SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1); + SDValue Const8 = DAG.getConstant(8, DL, MVT::i16); + SDValue AsInt = DAG.getNode( + ISD::OR, DL, MVT::i16, + {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})}); + EVT ToVT = Op->getValueType(0); + return MaybeBitcast(DAG, DL, ToVT, AsInt); +} + // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it // would get lowered as two constant loads and vector-packing move. 
// Instead we want just a constant move: @@ -2320,32 +2353,33 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, EVT VT = Op->getValueType(0); if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) return Op; - SDLoc DL(Op); if (!llvm::all_of(Op->ops(), [](SDValue Operand) { return Operand->isUndef() || isa(Operand) || isa(Operand); })) { + if (VT != MVT::v4i8) + return Op; // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. - if (VT == MVT::v4i8) { - SDValue C8 = DAG.getConstant(8, DL, MVT::i32); - SDValue E01 = DAG.getNode( - NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); - SDValue E012 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - E01, DAG.getConstant(16, DL, MVT::i32), C8); - SDValue E0123 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - E012, DAG.getConstant(24, DL, MVT::i32), C8); - return DAG.getNode(ISD::BITCAST, DL, VT, E0123); - } - return Op; + auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast, + uint64_t SelectionValue) -> SDValue { + SDValue L = Left; + SDValue R = Right; + if (Cast) { + L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); + R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); + } + return DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + }; + auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); + auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); + auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); + return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 
@@ -2376,8 +2410,8 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, } else { llvm_unreachable("Unsupported type"); } - SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); - return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); + SDValue Const = DAG.getConstant(Value, DL, MVT::i32); + return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const); } SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -2818,6 +2852,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return Op; case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::BITCAST: + return LowerBITCAST(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return Op; case ISD::EXTRACT_VECTOR_ELT: @@ -6128,6 +6164,28 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } +static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, + SmallVectorImpl &Results) { + // Handle bitcasting to v2i8 without hitting the default promotion + // strategy which goes through stack memory. + SDValue Op(Node, 0); + EVT ToVT = Op->getValueType(0); + if (ToVT != MVT::v2i8) { + return; + } + + // Bitcast to i16 and unpack elements into a vector + SDLoc DL(Node); + SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0)); + SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt); + SDValue Const8 = DAG.getConstant(8, DL, MVT::i16); + SDValue Vec1 = + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8})); + Results.push_back( + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1})); +} + /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results) { @@ -6413,6 +6471,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( switch (N->getOpcode()) { default: report_fatal_error("Unhandled custom legalization"); + case ISD::BITCAST: + ReplaceBITCAST(N, DAG, Results); + return; case ISD::LOAD: ReplaceLoadVector(N, DAG, Results); return; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 824a659671967..13153f4830b69 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -616,6 +616,8 @@ class NVPTXTargetLowering : public TargetLowering { const NVPTXSubtarget &STI; // cache the subtarget here SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; + SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index e35ba25b47880..3507573df1869 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -15,10 +15,12 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include using namespace llvm; @@ -117,7 +119,8 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { } // Convert NVVM intrinsics to target-generic LLVM code where possible. 
-static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { +static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, + IntrinsicInst *II) { // Each NVVM intrinsic we can simplify can be replaced with one of: // // * an LLVM intrinsic, @@ -413,11 +416,65 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { llvm_unreachable("All SpecialCase enumerators should be handled in switch."); } +// Returns an instruction pointer (may be nullptr if we do not know the answer). +// Returns nullopt if `II` is not one of the `isspacep` intrinsics. +static std::optional +handleSpaceCheckIntrinsics(InstCombiner &IC, IntrinsicInst &II) { + // Returns true/false when we know the answer, nullopt otherwise. + auto CheckASMatch = [](unsigned IID, unsigned AS) -> std::optional { + if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC || + AS == NVPTXAS::ADDRESS_SPACE_PARAM) + return std::nullopt; // Got to check at run-time. + switch (IID) { + case Intrinsic::nvvm_isspacep_global: + return AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; + case Intrinsic::nvvm_isspacep_local: + return AS == NVPTXAS::ADDRESS_SPACE_LOCAL; + case Intrinsic::nvvm_isspacep_shared: + return AS == NVPTXAS::ADDRESS_SPACE_SHARED; + case Intrinsic::nvvm_isspacep_shared_cluster: + // We can't tell shared from shared_cluster at compile time from AS alone, + // but it can't be either is AS is not shared. + return AS == NVPTXAS::ADDRESS_SPACE_SHARED ? 
std::nullopt + : std::optional{false}; + case Intrinsic::nvvm_isspacep_const: + return AS == NVPTXAS::ADDRESS_SPACE_CONST; + default: + llvm_unreachable("Unexpected intrinsic"); + } + }; + + switch (auto IID = II.getIntrinsicID()) { + case Intrinsic::nvvm_isspacep_global: + case Intrinsic::nvvm_isspacep_local: + case Intrinsic::nvvm_isspacep_shared: + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_isspacep_const: { + Value *Op0 = II.getArgOperand(0); + unsigned AS = Op0->getType()->getPointerAddressSpace(); + // Peek through ASC to generic AS. + // TODO: we could dig deeper through both ASCs and GEPs. + if (AS == NVPTXAS::ADDRESS_SPACE_GENERIC) + if (auto *ASCO = dyn_cast(Op0)) + AS = ASCO->getOperand(0)->getType()->getPointerAddressSpace(); + + if (std::optional Answer = CheckASMatch(IID, AS)) + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), *Answer)); + return nullptr; // Don't know the answer, got to check at run time. + } + default: + return std::nullopt; + } +} + std::optional NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { - if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) { + if (std::optional I = handleSpaceCheckIntrinsics(IC, II)) + return *I; + if (Instruction *I = convertNvvmIntrinsicToLlvm(IC, &II)) return I; - } + return std::nullopt; } diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index 32cebb65cb569..92af04a4cff2d 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -825,7 +825,8 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_ANY, P10F2_Read, P10F2_Read, P10F2_Read], def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read], (instrs SRADI_rec, - SRAWI_rec + SRAWI_rec, + SRAWI8_rec )>; // Single crack instructions @@ -833,7 +834,8 @@ def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, P10W_DISP_ANY, P10F2_Read], def : InstRW<[P10W_F2_4C, P10W_DISP_EVEN, 
P10W_DISP_ANY, P10F2_Read, P10F2_Read], (instrs SRAD_rec, - SRAW_rec + SRAW_rec, + SRAW8_rec )>; // 2-way crack instructions @@ -926,7 +928,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read], SETNBC, SETNBC8, SETNBCR, SETNBCR8, SRADI, SRADI_32, - SRAWI, + SRAWI, SRAWI8, SUBFIC, SUBFIC8, SUBFME, SUBFME8, SUBFME8O, SUBFMEO, @@ -1008,7 +1010,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read, P10FX_Read], SLD, SLW, SLW8, SRAD, - SRAW, + SRAW, SRAW8, SRD, SRW, SRW8, SUBF, SUBF8, diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index 395999c7242af..801ae83cd07c4 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -189,7 +189,8 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C], (instregex "F(N)?ABS(D|S)$"), (instregex "FNEG(D|S)$"), (instregex "FCPSGN(D|S)$"), - (instregex "SRAW(I)?$"), + (instregex "SRAW(8)?$"), + (instregex "SRAWI(8)?$"), (instregex "ISEL(8)?$"), RLDIMI, XSIEXPDP, @@ -1091,7 +1092,8 @@ def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, (instregex "RLD(I)?C(R|L)_rec$"), (instregex "RLW(IMI|INM|NM)(8)?_rec$"), (instregex "SLW(8)?_rec$"), - (instregex "SRAW(I)?_rec$"), + (instregex "SRAW(8)?_rec$"), + (instregex "SRAWI(8)?_rec$"), (instregex "SRW(8)?_rec$"), RLDICL_32_rec, RLDIMI_rec diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 72c5909f10c3b..39da428461393 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -552,6 +552,18 @@ def getAltVSXFMAOpcode : InstrMapping { let ValueCols = [["1"]]; } +def get64BitInstrFromSignedExt32BitInstr : InstrMapping { + let FilterClass = "SExt32To64"; + // Instructions with the same opcode. + let RowFields = ["Inst"]; + // Instructions with the same Interpretation64Bit value form a column. + let ColFields = ["Interpretation64Bit"]; + // The key column are not the Interpretation64Bit-form instructions. 
+ let KeyCol = ["0"]; + // Value columns are the Interpretation64Bit-form instructions. + let ValueCols = [["1"]]; +} + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index ae25f5c78a0e2..68419068e52a6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -932,6 +932,14 @@ defm SLW8 : XForm_6r<31, 24, (outs g8rc:$RA), (ins g8rc:$RST, g8rc:$RB), "slw", "$RA, $RST, $RB", IIC_IntGeneral, []>, ZExt32To64; defm SRW8 : XForm_6r<31, 536, (outs g8rc:$RA), (ins g8rc:$RST, g8rc:$RB), "srw", "$RA, $RST, $RB", IIC_IntGeneral, []>, ZExt32To64; + +defm SRAW8 : XForm_6rc<31, 792, (outs g8rc:$RA), (ins g8rc:$RST, g8rc:$RB), + "sraw", "$RA, $RST, $RB", IIC_IntShift, + []>, SExt32To64; + +defm SRAWI8 : XForm_10rc<31, 824, (outs g8rc:$RA), (ins g8rc:$RST, u5imm:$RB), + "srawi", "$RA, $RST, $RB", IIC_IntShift, []>, SExt32To64; + } // Interpretation64Bit // For fast-isel: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index bc2a1b295b433..3c331cee8f764 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -5250,6 +5250,215 @@ bool PPCInstrInfo::isTOCSaveMI(const MachineInstr &MI) const { // We limit the max depth to track incoming values of PHIs or binary ops // (e.g. AND) to avoid excessive cost. const unsigned MAX_BINOP_DEPTH = 1; + +// This function will promote the instruction which defines the register `Reg` +// in the parameter from a 32-bit to a 64-bit instruction if needed. 
The logic +// used to check whether an instruction needs to be promoted or not is similar +// to the logic used to check whether or not a defined register is sign or zero +// extended within the function PPCInstrInfo::isSignOrZeroExtended. +// Additionally, the `promoteInstr32To64ForElimEXTSW` function is recursive. +// BinOpDepth does not count all of the recursions. The parameter BinOpDepth is +// incremented only when `promoteInstr32To64ForElimEXTSW` calls itself more +// than once. This is done to prevent exponential recursion. +void PPCInstrInfo::promoteInstr32To64ForElimEXTSW(const Register &Reg, + MachineRegisterInfo *MRI, + unsigned BinOpDepth, + LiveVariables *LV) const { + if (!Reg.isVirtual()) + return; + + MachineInstr *MI = MRI->getVRegDef(Reg); + if (!MI) + return; + + unsigned Opcode = MI->getOpcode(); + + switch (Opcode) { + case PPC::OR: + case PPC::ISEL: + case PPC::OR8: + case PPC::PHI: { + if (BinOpDepth >= MAX_BINOP_DEPTH) + break; + unsigned OperandEnd = 3, OperandStride = 1; + if (Opcode == PPC::PHI) { + OperandEnd = MI->getNumOperands(); + OperandStride = 2; + } + + for (unsigned I = 1; I < OperandEnd; I += OperandStride) { + assert(MI->getOperand(I).isReg() && "Operand must be register"); + promoteInstr32To64ForElimEXTSW(MI->getOperand(I).getReg(), MRI, + BinOpDepth + 1, LV); + } + + break; + } + case PPC::COPY: { + // Refers to the logic of the `case PPC::COPY` statement in the function + // PPCInstrInfo::isSignOrZeroExtended(). + + Register SrcReg = MI->getOperand(1).getReg(); + // In both ELFv1 and v2 ABI, method parameters and the return value + // are sign- or zero-extended. + const MachineFunction *MF = MI->getMF(); + if (!MF->getSubtarget().isSVR4ABI()) { + // If this is a copy from another register, we recursively promote the + // source. + promoteInstr32To64ForElimEXTSW(SrcReg, MRI, BinOpDepth, LV); + return; + } + + // From here on everything is SVR4ABI. 
COPY will be eliminated in the other + // pass, we do not need promote the COPY pseudo opcode. + + if (SrcReg != PPC::X3) + // If this is a copy from another register, we recursively promote the + // source. + promoteInstr32To64ForElimEXTSW(SrcReg, MRI, BinOpDepth, LV); + return; + } + case PPC::ORI: + case PPC::XORI: + case PPC::ORIS: + case PPC::XORIS: + case PPC::ORI8: + case PPC::XORI8: + case PPC::ORIS8: + case PPC::XORIS8: + promoteInstr32To64ForElimEXTSW(MI->getOperand(1).getReg(), MRI, BinOpDepth, + LV); + break; + case PPC::AND: + case PPC::AND8: + if (BinOpDepth >= MAX_BINOP_DEPTH) + break; + + promoteInstr32To64ForElimEXTSW(MI->getOperand(1).getReg(), MRI, + BinOpDepth + 1, LV); + promoteInstr32To64ForElimEXTSW(MI->getOperand(2).getReg(), MRI, + BinOpDepth + 1, LV); + break; + } + + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + if (RC == &PPC::G8RCRegClass || RC == &PPC::G8RC_and_G8RC_NOX0RegClass) + return; + + const PPCInstrInfo *TII = + MI->getMF()->getSubtarget().getInstrInfo(); + + // Map the 32bit to 64bit opcodes for instructions that are not signed or zero + // extended themselves, but may have operands who's destination registers of + // signed or zero extended instructions. + std::unordered_map OpcodeMap = { + {PPC::OR, PPC::OR8}, {PPC::ISEL, PPC::ISEL8}, + {PPC::ORI, PPC::ORI8}, {PPC::XORI, PPC::XORI8}, + {PPC::ORIS, PPC::ORIS8}, {PPC::XORIS, PPC::XORIS8}, + {PPC::AND, PPC::AND8}}; + + int NewOpcode = -1; + auto It = OpcodeMap.find(Opcode); + if (It != OpcodeMap.end()) { + // Set the new opcode to the mapped 64-bit version. + NewOpcode = It->second; + } else { + if (!TII->isSExt32To64(Opcode)) + return; + + // The TableGen function `get64BitInstrFromSignedExt32BitInstr` is used to + // map the 32-bit instruction with the `SExt32To64` flag to the 64-bit + // instruction with the same opcode. 
+ NewOpcode = PPC::get64BitInstrFromSignedExt32BitInstr(Opcode); + } + + assert(NewOpcode != -1 && + "Must have a 64-bit opcode to map the 32-bit opcode!"); + + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + const MCInstrDesc &MCID = TII->get(NewOpcode); + const TargetRegisterClass *NewRC = + TRI->getRegClass(MCID.operands()[0].RegClass); + + Register SrcReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg); + + // If the register class of the defined register in the 32-bit instruction + // is the same as the register class of the defined register in the promoted + // 64-bit instruction, we do not need to promote the instruction. + if (NewRC == SrcRC) + return; + + DebugLoc DL = MI->getDebugLoc(); + auto MBB = MI->getParent(); + + // Since the pseudo-opcode of the instruction is promoted from 32-bit to + // 64-bit, if the source reg class of the original instruction belongs to + // PPC::GRCRegClass or PPC::GPRC_and_GPRC_NOR0RegClass, we need to promote + // the operand to PPC::G8CRegClass or PPC::G8RC_and_G8RC_NOR0RegClass, + // respectively. + DenseMap PromoteRegs; + for (unsigned i = 1; i < MI->getNumOperands(); i++) { + MachineOperand &Operand = MI->getOperand(i); + if (!Operand.isReg()) + continue; + + Register OperandReg = Operand.getReg(); + if (!OperandReg.isVirtual()) + continue; + + const TargetRegisterClass *NewUsedRegRC = + TRI->getRegClass(MCID.operands()[i].RegClass); + const TargetRegisterClass *OrgRC = MRI->getRegClass(OperandReg); + if (NewUsedRegRC != OrgRC && (OrgRC == &PPC::GPRCRegClass || + OrgRC == &PPC::GPRC_and_GPRC_NOR0RegClass)) { + // Promote the used 32-bit register to 64-bit register. 
+ Register TmpReg = MRI->createVirtualRegister(NewUsedRegRC); + Register DstTmpReg = MRI->createVirtualRegister(NewUsedRegRC); + BuildMI(*MBB, MI, DL, TII->get(PPC::IMPLICIT_DEF), TmpReg); + BuildMI(*MBB, MI, DL, TII->get(PPC::INSERT_SUBREG), DstTmpReg) + .addReg(TmpReg) + .addReg(OperandReg) + .addImm(PPC::sub_32); + PromoteRegs[i] = DstTmpReg; + } + } + + Register NewDefinedReg = MRI->createVirtualRegister(NewRC); + + BuildMI(*MBB, MI, DL, TII->get(NewOpcode), NewDefinedReg); + MachineBasicBlock::instr_iterator Iter(MI); + --Iter; + MachineInstrBuilder MIBuilder(*Iter->getMF(), Iter); + for (unsigned i = 1; i < MI->getNumOperands(); i++) { + if (PromoteRegs.find(i) != PromoteRegs.end()) + MIBuilder.addReg(PromoteRegs[i], RegState::Kill); + else + Iter->addOperand(MI->getOperand(i)); + } + + for (unsigned i = 1; i < Iter->getNumOperands(); i++) { + MachineOperand &Operand = Iter->getOperand(i); + if (!Operand.isReg()) + continue; + Register OperandReg = Operand.getReg(); + if (!OperandReg.isVirtual()) + continue; + LV->recomputeForSingleDefVirtReg(OperandReg); + } + + MI->eraseFromParent(); + + // A defined register may be used by other instructions that are 32-bit. + // After the defined register is promoted to 64-bit for the promoted + // instruction, we need to demote the 64-bit defined register back to a + // 32-bit register + BuildMI(*MBB, ++Iter, DL, TII->get(PPC::COPY), SrcReg) + .addReg(NewDefinedReg, RegState::Kill, PPC::sub_32); + LV->recomputeForSingleDefVirtReg(NewDefinedReg); + return; +} + // The isSignOrZeroExtended function is recursive. The parameter BinOpDepth // does not count all of the recursions. The parameter BinOpDepth is incremented // only when isSignOrZeroExtended calls itself more than once. 
This is done to diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 40996f6fbb75e..c2abf2f942746 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -17,6 +17,7 @@ #include "PPC.h" #include "PPCRegisterInfo.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER @@ -625,6 +626,10 @@ class PPCInstrInfo : public PPCGenInstrInfo { const MachineRegisterInfo *MRI) const { return isSignOrZeroExtended(Reg, 0, MRI).second; } + void promoteInstr32To64ForElimEXTSW(const Register &Reg, + MachineRegisterInfo *MRI, + unsigned BinOpDepth, + LiveVariables *LV) const; bool convertToImmediateForm(MachineInstr &MI, SmallSet &RegsToUpdate, diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index b8abee76cdfa8..b762cac8ea4f3 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -1053,7 +1053,16 @@ bool PPCMIPeephole::simplifyCode() { } else if (MI.getOpcode() == PPC::EXTSW_32_64 && TII->isSignExtended(NarrowReg, MRI)) { // We can eliminate EXTSW if the input is known to be already - // sign-extended. + // sign-extended. However, we are not sure whether a spill will occur + // during register allocation. If there is no promotion, it will use + // 'stw' instead of 'std', and 'lwz' instead of 'ld' when spilling, + // since the register class is 32-bits. Consequently, the high 32-bit + // information will be lost. Therefore, all these instructions in the + // chain used to deduce sign extension to eliminate the 'extsw' will + // need to be promoted to 64-bit pseudo instructions when the 'extsw' + // is eliminated. 
+ TII->promoteInstr32To64ForElimEXTSW(NarrowReg, MRI, 0, LV); + LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n"); Register TmpReg = MF->getRegInfo().createVirtualRegister(&PPC::G8RCRegClass); diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/llvm/lib/Target/PowerPC/PPCScheduleP7.td index 93399e5ddbca8..bf7f2f7a9c999 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP7.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP7.td @@ -216,7 +216,7 @@ let SchedModel = P7Model in { RLWNM, RLWNM8, RLWNM_rec, RLDIMI, RLDIMI_rec, RLDICL_32, RLDICL_32_64, RLDICL_32_rec, RLDICR_32, RLWINM8_rec, RLWNM8_rec, SLD, SLD_rec, SLW, SLW8, SLW_rec, SLW8_rec, SRD, SRD_rec, SRW, SRW8, SRW_rec, - SRW8_rec, SRADI, SRADI_rec, SRAWI, SRAWI_rec, SRAD, SRAD_rec, SRAW, SRAW_rec, + SRW8_rec, SRADI, SRADI_rec, SRAWI, SRAWI_rec, SRAWI8, SRAWI8_rec, SRAD, SRAD_rec, SRAW, SRAW_rec, SRAW8, SRAW8_rec, SRADI_32, SUBFE, SUBFE8, SUBFE8O_rec, SUBFEO_rec )>; diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index cd188304595e1..133c47174570c 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -505,10 +505,13 @@ bool PPCPassConfig::addPreISel() { ? EnableGlobalMerge : (TM->getTargetTriple().isOSAIX() && getOptLevel() != CodeGenOptLevel::None)) - addPass( - createGlobalMergePass(TM, GlobalMergeMaxOffset, false, false, true)); + addPass(createGlobalMergePass(TM, GlobalMergeMaxOffset, false, false, true, + true)); - if (MergeStringPool && getOptLevel() != CodeGenOptLevel::None) + if ((MergeStringPool.getNumOccurrences() > 0) + ? 
MergeStringPool + : (TM->getTargetTriple().isOSLinux() && + getOptLevel() != CodeGenOptLevel::None)) addPass(createPPCMergeStringPoolPass()); if (!DisableInstrFormPrep && getOptLevel() != CodeGenOptLevel::None) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index b3a6cd40ea039..19103e219cb80 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -341,7 +341,9 @@ enum OperandType : unsigned { OPERAND_VEC_POLICY, // Vector SEW operand. OPERAND_SEW, - OPERAND_LAST_RISCV_IMM = OPERAND_SEW, + // Vector rounding mode for VXRM or FRM. + OPERAND_VEC_RM, + OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM, // Operand is either a register or uimm5, this is used by V extension pseudo // instructions to represent a value that be passed as AVL to either vsetvli // or vsetivli. diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index dc3f8254cb4e0..6291842e071a3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -693,7 +693,7 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) { // The constants that can be encoded in the THeadMemIdx instructions // are of the form (sign_extend(imm5) << imm2). 
- int64_t Shift; + unsigned Shift; for (Shift = 0; Shift < 4; Shift++) if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) break; @@ -3366,7 +3366,7 @@ bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5, SDValue &Shl2) { if (auto *C = dyn_cast(N)) { int64_t Offset = C->getSExtValue(); - int64_t Shift; + unsigned Shift; for (Shift = 0; Shift < 4; Shift++) if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) break; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index af7a39b2580a3..920b06c7ba6ec 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1403,7 +1403,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } setOperationAction({ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, - ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, + ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::SCALAR_TO_VECTOR}, VT, Custom); setOperationAction( @@ -3030,6 +3031,7 @@ static RISCVFPRndMode::RoundingMode matchRoundingOp(unsigned Opc) { case ISD::VP_FROUND: return RISCVFPRndMode::RMM; case ISD::FRINT: + case ISD::VP_FRINT: return RISCVFPRndMode::DYN; } @@ -3101,6 +3103,8 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, switch (Op.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); + case ISD::FRINT: + case ISD::VP_FRINT: case ISD::FCEIL: case ISD::VP_FCEIL: case ISD::FFLOOR: @@ -3120,10 +3124,6 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, Truncated = DAG.getNode(RISCVISD::VFCVT_RTZ_X_F_VL, DL, IntVT, Src, Mask, VL); break; - case ISD::FRINT: - case ISD::VP_FRINT: - Truncated = DAG.getNode(RISCVISD::VFCVT_X_F_VL, DL, IntVT, Src, Mask, VL); - break; case ISD::FNEARBYINT: case ISD::VP_FNEARBYINT: Truncated = DAG.getNode(RISCVISD::VFROUND_NOEXCEPT_VL, DL, ContainerVT, Src, @@ -3294,8 +3294,10 @@ static SDValue lowerVectorXRINT(SDValue Op, SelectionDAG 
&DAG, } auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - SDValue Truncated = - DAG.getNode(RISCVISD::VFCVT_X_F_VL, DL, ContainerVT, Src, Mask, VL); + SDValue Truncated = DAG.getNode( + RISCVISD::VFCVT_RM_X_F_VL, DL, ContainerVT, Src, Mask, + DAG.getTargetConstant(RISCVFPRndMode::DYN, DL, Subtarget.getXLenVT()), + VL); if (!VT.isFixedLengthVector()) return Truncated; @@ -3990,10 +3992,9 @@ static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG, A = DAG.getNode(ISD::AND, SDLoc(A), XLenVT, A, Mask); B = DAG.getNode(ISD::AND, SDLoc(B), XLenVT, B, Mask); SDValue ShtAmt = DAG.getConstant(ElemSizeInBits, ElemDL, XLenVT); - SDNodeFlags Flags; - Flags.setDisjoint(true); return DAG.getNode(ISD::OR, ElemDL, XLenVT, A, - DAG.getNode(ISD::SHL, ElemDL, XLenVT, B, ShtAmt), Flags); + DAG.getNode(ISD::SHL, ElemDL, XLenVT, B, ShtAmt), + SDNodeFlags::Disjoint); }; SmallVector NewOperands; @@ -6022,11 +6023,8 @@ static SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG, SDValue ClearedSign = DAG.getNode(ISD::AND, DL, XLenVT, MagAsInt, ClearSignMask); - SDNodeFlags Flags; - Flags.setDisjoint(true); - - SDValue CopiedSign = - DAG.getNode(ISD::OR, DL, XLenVT, ClearedSign, SignBit, Flags); + SDValue CopiedSign = DAG.getNode(ISD::OR, DL, XLenVT, ClearedSign, SignBit, + SDNodeFlags::Disjoint); return DAG.getNode(RISCVISD::FMV_H_X, DL, VT, CopiedSign); } @@ -6170,7 +6168,7 @@ static unsigned getRISCVVLOp(SDValue Op) { case ISD::VP_LRINT: case ISD::LLRINT: case ISD::VP_LLRINT: - return RISCVISD::VFCVT_X_F_VL; + return RISCVISD::VFCVT_RM_X_F_VL; } // clang-format on #undef OP_CASE @@ -6183,7 +6181,7 @@ static bool hasPassthruOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == - 130 && + 128 && RISCVISD::LAST_RISCV_STRICTFP_OPCODE - ISD::FIRST_TARGET_STRICTFP_OPCODE == 21 && @@ -6209,7 +6207,7 @@ static bool 
hasMaskOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert(RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == - 130 && + 128 && RISCVISD::LAST_RISCV_STRICTFP_OPCODE - ISD::FIRST_TARGET_STRICTFP_OPCODE == 21 && @@ -6514,9 +6512,16 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, if (VT.isFixedLengthVector()) ContainerVT = getContainerForFixedLengthVector(VT); SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), Scalar); - SDValue V = DAG.getNode(RISCVISD::VMV_S_X_VL, DL, ContainerVT, - DAG.getUNDEF(ContainerVT), Scalar, VL); + + SDValue V; + if (VT.isFloatingPoint()) { + V = DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Scalar, VL); + } else { + Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), Scalar); + V = DAG.getNode(RISCVISD::VMV_S_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Scalar, VL); + } if (VT.isFixedLengthVector()) V = convertFromScalableVector(VT, V, DAG, Subtarget); return V; @@ -11117,6 +11122,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, SDValue BasePtr = MemSD->getBasePtr(); SDValue Mask, PassThru, VL; + bool IsExpandingLoad = false; if (const auto *VPLoad = dyn_cast(Op)) { Mask = VPLoad->getMask(); PassThru = DAG.getUNDEF(VT); @@ -11125,6 +11131,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, const auto *MLoad = cast(Op); Mask = MLoad->getMask(); PassThru = MLoad->getPassThru(); + IsExpandingLoad = MLoad->isExpandingLoad(); } bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); @@ -11144,18 +11151,26 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, if (!VL) VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - unsigned IntID = - IsUnmasked ? 
Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask; + SDValue ExpandingVL; + if (!IsUnmasked && IsExpandingLoad) { + ExpandingVL = VL; + VL = + DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Mask, + getAllOnesMask(Mask.getSimpleValueType(), VL, DL, DAG), VL); + } + + unsigned IntID = IsUnmasked || IsExpandingLoad ? Intrinsic::riscv_vle + : Intrinsic::riscv_vle_mask; SmallVector Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)}; - if (IsUnmasked) + if (IntID == Intrinsic::riscv_vle) Ops.push_back(DAG.getUNDEF(ContainerVT)); else Ops.push_back(PassThru); Ops.push_back(BasePtr); - if (!IsUnmasked) + if (IntID == Intrinsic::riscv_vle_mask) Ops.push_back(Mask); Ops.push_back(VL); - if (!IsUnmasked) + if (IntID == Intrinsic::riscv_vle_mask) Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT)); SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); @@ -11163,6 +11178,32 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO); Chain = Result.getValue(1); + if (ExpandingVL) { + MVT IndexVT = ContainerVT; + if (ContainerVT.isFloatingPoint()) + IndexVT = ContainerVT.changeVectorElementTypeToInteger(); + + MVT IndexEltVT = IndexVT.getVectorElementType(); + bool UseVRGATHEREI16 = false; + // If index vector is an i8 vector and the element count exceeds 256, we + // should change the element type of index vector to i16 to avoid + // overflow. + if (IndexEltVT == MVT::i8 && VT.getVectorNumElements() > 256) { + // FIXME: We need to do vector splitting manually for LMUL=8 cases. + assert(getLMUL(IndexVT) != RISCVII::LMUL_8); + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + UseVRGATHEREI16 = true; + } + + SDValue Iota = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::riscv_viota, DL, XLenVT), + DAG.getUNDEF(IndexVT), Mask, ExpandingVL); + Result = + DAG.getNode(UseVRGATHEREI16 ? 
RISCVISD::VRGATHEREI16_VV_VL + : RISCVISD::VRGATHER_VV_VL, + DL, ContainerVT, Result, Iota, PassThru, Mask, ExpandingVL); + } if (VT.isFixedLengthVector()) Result = convertFromScalableVector(VT, Result, DAG, Subtarget); @@ -11549,6 +11590,11 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const { } } } + // VFCVT_RM_X_F_VL requires a rounding mode to be injected before the VL. + if (RISCVISDOpc == RISCVISD::VFCVT_RM_X_F_VL && + ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) == OpIdx.index()) + Ops.push_back(DAG.getTargetConstant(RISCVFPRndMode::DYN, DL, + Subtarget.getXLenVT())); // Pass through operands which aren't fixed-length vectors. if (!V.getValueType().isFixedLengthVector()) { Ops.push_back(V); @@ -13291,9 +13337,8 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG, EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, RHSIdx + 1); SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec, DAG.getVectorIdxConstant(0, DL)); - auto Flags = ReduceVec->getFlags(); - Flags.intersectWith(N->getFlags()); - return DAG.getNode(ReduceOpc, DL, VT, Vec, Flags); + return DAG.getNode(ReduceOpc, DL, VT, Vec, + ReduceVec->getFlags() & N->getFlags()); } return SDValue(); @@ -15710,10 +15755,6 @@ static SDValue performFP_TO_INTCombine(SDNode *N, unsigned Opc = IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL; FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL); - } else if (FRM == RISCVFPRndMode::DYN) { - unsigned Opc = - IsSigned ? RISCVISD::VFCVT_X_F_VL : RISCVISD::VFCVT_XU_F_VL; - FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL); } else { unsigned Opc = IsSigned ? 
RISCVISD::VFCVT_RM_X_F_VL : RISCVISD::VFCVT_RM_XU_F_VL; @@ -20277,8 +20318,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VFCVT_RTZ_XU_F_VL) NODE_NAME_CASE(VFCVT_RM_X_F_VL) NODE_NAME_CASE(VFCVT_RM_XU_F_VL) - NODE_NAME_CASE(VFCVT_X_F_VL) - NODE_NAME_CASE(VFCVT_XU_F_VL) NODE_NAME_CASE(VFROUND_NOEXCEPT_VL) NODE_NAME_CASE(SINT_TO_FP_VL) NODE_NAME_CASE(UINT_TO_FP_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 0b07ad7d7a423..9ae70d257fa44 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -307,8 +307,6 @@ enum NodeType : unsigned { FCOPYSIGN_VL, // Has a passthru operand VFCVT_RTZ_X_F_VL, VFCVT_RTZ_XU_F_VL, - VFCVT_X_F_VL, - VFCVT_XU_F_VL, VFROUND_NOEXCEPT_VL, VFCVT_RM_X_F_VL, // Has a rounding mode operand. VFCVT_RM_XU_F_VL, // Has a rounding mode operand. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index d5b086861d71e..04bb964bfc48c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2551,6 +2551,13 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_SEW: Ok = Imm == 0 || (Imm >= 3 && Imm <= 6); break; + case RISCVOp::OPERAND_VEC_RM: + assert(RISCVII::hasRoundModeOp(Desc.TSFlags)); + if (RISCVII::usesVXRM(Desc.TSFlags)) + Ok = isUInt<2>(Imm); + else + Ok = RISCVFPRndMode::isValidRoundingMode(Imm); + break; } if (!Ok) { ErrInfo = "Invalid immediate"; @@ -2623,6 +2630,13 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (int Idx = RISCVII::getFRMOpNum(Desc); + Idx >= 0 && MI.getOperand(Idx).getImm() == RISCVFPRndMode::DYN && + !MI.readsRegister(RISCV::FRM, /*TRI=*/nullptr)) { + ErrInfo = "dynamic rounding mode should read FRM"; + return false; + } + return true; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 6ffdae1d7df2a..89e71b7c22c12 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -92,6 +92,10 @@ def sew : RISCVOp { let OperandType = "OPERAND_SEW"; } +def vec_rm : RISCVOp { + let OperandType = "OPERAND_VEC_RM"; +} + // X0 has special meaning for vsetvl/vsetvli. // rd | rs1 | AVL value | Effect on vl //-------------------------------------------------------------- @@ -877,7 +881,7 @@ class VPseudoILoadNoMask LMUL, bit Ordered, bit EarlyClobber, - int TargetConstraintType = 1> : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$dest, GPRMem:$rs1, IdxClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -899,7 +903,7 @@ class VPseudoILoadMask LMUL, bit Ordered, bit EarlyClobber, - int TargetConstraintType = 1> : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, GPRMem:$rs1, IdxClass:$rs2, @@ -1021,7 +1025,7 @@ class VPseudoNullaryPseudoM : class VPseudoUnaryNoMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$passthru, OpClass:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -1039,7 +1043,7 @@ class VPseudoUnaryNoMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins OpClass:$rs2, AVL:$vl, sew:$sew), []>, RISCVVPseudo { @@ -1055,9 +1059,9 @@ class VPseudoUnaryNoMaskNoPolicy : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$rm, + (ins RetClass:$passthru, OpClass:$rs2, vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1070,12 +1074,13 @@ class VPseudoUnaryNoMaskRoundingMode : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, OpClass:$rs2, VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -1094,10 +1099,10 @@ class 
VPseudoUnaryMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, ixlenimm:$rm, + VMaskOp:$vm, vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1111,6 +1116,7 @@ class VPseudoUnaryMaskRoundingMode : - Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$frm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = !interleave([Constraint, "$rd = $passthru"], ","); - let TargetOverlapConstraintType = TargetConstraintType; - let HasVLOp = 1; - let HasSEWOp = 1; - let HasVecPolicyOp = 1; - let HasRoundModeOp = 1; -} - -class VPseudoUnaryMask_FRM : - Pseudo<(outs GetVRegNoV0.R:$rd), - (ins GetVRegNoV0.R:$passthru, OpClass:$rs2, - VMaskOp:$vm, ixlenimm:$frm, - AVL:$vl, sew:$sew, vec_policy:$policy), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = !interleave([Constraint, "$rd = $passthru"], ","); - let TargetOverlapConstraintType = TargetConstraintType; - let HasVLOp = 1; - let HasSEWOp = 1; - let HasVecPolicyOp = 1; - let UsesMaskPolicy = 1; - let HasRoundModeOp = 1; -} - class VPseudoUnaryNoMaskGPROut : Pseudo<(outs GPR:$rd), (ins VR:$rs2, AVL:$vl, sew:$sew), []>, @@ -1211,7 +1177,7 @@ class VPseudoBinaryNoMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew), []>, RISCVVPseudo { @@ -1228,7 +1194,7 @@ class VPseudoBinaryNoMaskPolicy : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -1247,10 +1213,10 @@ class VPseudoBinaryNoMaskRoundingMode : + bit UsesVXRM_ = 1, + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), - (ins RetClass:$passthru, Op1Class:$rs2, 
Op2Class:$rs1, ixlenimm:$rm, + (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1262,18 +1228,19 @@ class VPseudoBinaryNoMaskRoundingMode : + bit UsesVXRM_, + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, - VMaskOp:$vm, ixlenimm:$rm, AVL:$vl, + VMaskOp:$vm, vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1286,6 +1253,7 @@ class VPseudoBinaryMaskPolicyRoundingMode : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$rs2, Op2Class:$rs1, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -1314,10 +1282,10 @@ class VPseudoTiedBinaryNoMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$rs2, Op2Class:$rs1, - ixlenimm:$rm, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { @@ -1333,6 +1301,7 @@ class VPseudoTiedBinaryNoMaskRoundingMode LMUL, @@ -1367,7 +1336,7 @@ class VPseudoBinaryMaskPolicy : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, @@ -1408,7 +1377,7 @@ class VPseudoTernaryMaskPolicyRoundingMode.R:$passthru, Op1Class:$rs2, Op2Class:$rs1, VMaskOp:$vm, - ixlenimm:$rm, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1420,6 +1389,7 @@ class VPseudoTernaryMaskPolicyRoundingMode : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, @@ -1449,7 +1419,7 @@ class VPseudoBinaryMOutMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, Op2Class:$rs1, @@ -1470,12 +1440,12 @@ class VPseudoTiedBinaryMask : + bits<2> TargetConstraintType = 1> : Pseudo<(outs GetVRegNoV0.R:$rd), (ins GetVRegNoV0.R:$passthru, Op2Class:$rs1, 
VMaskOp:$vm, - ixlenimm:$rm, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; @@ -1490,6 +1460,7 @@ class VPseudoTiedBinaryMaskRoundingMode : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), !if(CarryIn, (ins Op1Class:$rs2, Op2Class:$rs1, @@ -1520,7 +1491,7 @@ class VPseudoTiedBinaryCarryIn : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl, sew:$sew), []>, @@ -1556,7 +1527,7 @@ class VPseudoTernaryNoMaskWithPolicy : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, AVL:$vl, sew:$sew, vec_policy:$policy), []>, @@ -1575,10 +1546,10 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode : + bits<2> TargetConstraintType = 1> : Pseudo<(outs RetClass:$rd), (ins RetClass:$rs3, Op1Class:$rs1, Op2Class:$rs2, - ixlenimm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, + vec_rm:$rm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; let mayStore = 0; @@ -1590,6 +1561,7 @@ class VPseudoTernaryNoMaskWithPolicyRoundingMode TargetConstraintType = 1, bit Commutable = 0> { let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); @@ -2123,8 +2095,8 @@ multiclass VPseudoBinaryRoundingMode TargetConstraintType = 1, bit Commutable = 0> { let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); @@ -2147,7 +2119,7 @@ multiclass VPseudoBinaryM TargetConstraintType = 1, bit Commutable = 0> { let VLMul = MInfo.value, isCommutable = Commutable in { def "_" # MInfo.MX : VPseudoBinaryNoMask { + bits<2> TargetConstraintType = 1> { let VLMul = MInfo.value in { def "_" # MInfo.MX # "_TIED": VPseudoTiedBinaryNoMask; @@ -2195,7 +2167,7 @@ multiclass VPseudoTiedBinaryRoundingMode { + bits<2> 
TargetConstraintType = 1> { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); let VLMul = MInfo.value in { def suffix # "_TIED": @@ -2417,7 +2389,7 @@ multiclass VPseudoBinaryV_WI_RM { multiclass VPseudoBinaryV_VM { + bits<2> TargetConstraintType = 1> { let isCommutable = Commutable in def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarry { } multiclass VPseudoBinaryV_XM { + string Constraint = "", bits<2> TargetConstraintType = 1> { def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarry { + string Constraint = "", bits<2> TargetConstraintType = 1> { def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX : VPseudoBinaryCarry TargetConstraintType = 1, bit Commutable = 0> { defm _VV : VPseudoBinaryM; } -multiclass VPseudoBinaryM_VX { +multiclass VPseudoBinaryM_VX TargetConstraintType = 1> { defm "_VX" : VPseudoBinaryM; } -multiclass VPseudoBinaryM_VF { +multiclass VPseudoBinaryM_VF TargetConstraintType = 1> { defm "_V" # f.FX : VPseudoBinaryM; } -multiclass VPseudoBinaryM_VI { +multiclass VPseudoBinaryM_VI TargetConstraintType = 1> { defm _VI : VPseudoBinaryM; } @@ -3202,7 +3174,7 @@ multiclass VPseudoTernaryWithPolicy { + bits<2> TargetConstraintType = 1> { let VLMul = MInfo.value in { let isCommutable = Commutable in def "_" # MInfo.MX : VPseudoTernaryNoMaskWithPolicy; @@ -3218,7 +3190,7 @@ multiclass VPseudoTernaryWithPolicyRoundingMode { + bits<2> TargetConstraintType = 1> { let VLMul = MInfo.value in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); let isCommutable = Commutable in @@ -3548,7 +3520,7 @@ multiclass VPseudoConversion { + bits<2> TargetConstraintType = 1> { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); let VLMul = MInfo.value, SEW=sew in { def suffix : VPseudoUnaryNoMask; @@ -3563,7 +3535,7 @@ multiclass VPseudoConversionRoundingMode { + bits<2> TargetConstraintType = 1> { let VLMul = MInfo.value, SEW=sew in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # 
sew, "_" # MInfo.MX); def suffix : VPseudoUnaryNoMaskRoundingMode; @@ -3574,23 +3546,6 @@ multiclass VPseudoConversionRoundingMode { - let VLMul = MInfo.value, SEW=sew in { - defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); - def suffix : VPseudoUnaryNoMask_FRM; - def suffix # "_MASK" : VPseudoUnaryMask_FRM, - RISCVMaskedPseudo; - } -} - multiclass VPseudoConversionNoExcept, - SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX, - forcePassthruRead=true>; - } -} - multiclass VPseudoVFROUND_NOEXCEPT_V { foreach m = MxListF in { defm _V : VPseudoConversionNoExcept, @@ -3641,15 +3588,6 @@ multiclass VPseudoVCVTF_V_RM { } } -multiclass VPseudoVCVTF_RM_V { - foreach m = MxListF in { - foreach e = SchedSEWSet.val in - defm _V : VPseudoConversionRM, - SchedUnary<"WriteVFCvtIToFV", "ReadVFCvtIToFV", m.MX, e, - forcePassthruRead=true>; - } -} - multiclass VPseudoVWCVTI_V { defvar constraint = "@earlyclobber $rd"; foreach m = MxListFW in { @@ -3668,15 +3606,6 @@ multiclass VPseudoVWCVTI_V_RM { } } -multiclass VPseudoVWCVTI_RM_V { - defvar constraint = "@earlyclobber $rd"; - foreach m = MxListFW in { - defm _V : VPseudoConversionRM, - SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX, - forcePassthruRead=true>; - } -} - multiclass VPseudoVWCVTF_V { defvar constraint = "@earlyclobber $rd"; foreach m = MxListW in { @@ -3717,15 +3646,6 @@ multiclass VPseudoVNCVTI_W_RM { } } -multiclass VPseudoVNCVTI_RM_W { - defvar constraint = "@earlyclobber $rd"; - foreach m = MxListW in { - defm _W : VPseudoConversionRM, - SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX, - forcePassthruRead=true>; - } -} - multiclass VPseudoVNCVTF_W_RM { defvar constraint = "@earlyclobber $rd"; foreach m = MxListFW in { @@ -3738,17 +3658,6 @@ multiclass VPseudoVNCVTF_W_RM { } } -multiclass VPseudoVNCVTF_RM_W { - defvar constraint = "@earlyclobber $rd"; - foreach m = MxListFW in { - foreach e = SchedSEWSet.val in - defm _W : VPseudoConversionRM, - 
SchedUnary<"WriteVFNCvtIToFV", "ReadVFNCvtIToFV", m.MX, e, - forcePassthruRead=true>; - } -} - multiclass VPseudoVNCVTD_W { defvar constraint = "@earlyclobber $rd"; foreach m = MxListFW in { @@ -4450,21 +4359,22 @@ class VPatTernaryNoMaskWithPolicy : + DAGOperand op2_kind, + bit isSEWAware = false> : Pat<(result_type (!cast(intrinsic) (result_type result_reg_class:$rs3), (op1_type op1_reg_class:$rs1), (op2_type op2_kind:$rs2), VLOpFrag, (XLenVT timm:$policy))), - (!cast(inst#"_"#kind#"_"#vlmul.MX) + (!cast(inst#"_"#kind#"_"#vlmul.MX#!if(isSEWAware, "_E"#!shl(1, log2sew), "")) result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - GPR:$vl, sew, (XLenVT timm:$policy))>; + GPR:$vl, log2sew, (XLenVT timm:$policy))>; class VPatTernaryNoMaskWithPolicyRoundingMode, SDTCisVT<6, XLenVT>]>; // Float -> Int -def riscv_vfcvt_xu_f_vl : SDNode<"RISCVISD::VFCVT_XU_F_VL", SDT_RISCVFP2IOp_VL>; -def riscv_vfcvt_x_f_vl : SDNode<"RISCVISD::VFCVT_X_F_VL", SDT_RISCVFP2IOp_VL>; def riscv_vfcvt_rm_xu_f_vl : SDNode<"RISCVISD::VFCVT_RM_XU_F_VL", SDT_RISCVFP2IOp_RM_VL>; def riscv_vfcvt_rm_x_f_vl : SDNode<"RISCVISD::VFCVT_RM_X_F_VL", SDT_RISCVFP2IOp_RM_VL>; @@ -1206,24 +1204,6 @@ multiclass VPatConvertFP2IVL_V { } } -multiclass VPatConvertFP2IVL_V_RM { - foreach fvti = AllFloatVectors in { - defvar ivti = GetIntVTypeInfo.Vti; - let Predicates = !listconcat(GetVTypePredicates.Predicates, - GetVTypePredicates.Predicates) in - def : Pat<(ivti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), - VLOpFrag)), - (!cast(instruction_name#"_"#ivti.LMul.MX#"_MASK") - (ivti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, ivti.Log2SEW, TA_MA)>; - } -} - multiclass VPatConvertFP2I_RM_VL_V { foreach fvti = AllFloatVectors in { @@ -1289,25 +1269,6 @@ multiclass VPatWConvertFP2IVL_V } } -multiclass VPatWConvertFP2IVL_V_RM { - foreach fvtiToFWti = 
AllWidenableFloatVectors in { - defvar fvti = fvtiToFWti.Vti; - defvar iwti = GetIntVTypeInfo.Vti; - let Predicates = !listconcat(GetVTypePredicates.Predicates, - GetVTypePredicates.Predicates) in - def : Pat<(iwti.Vector (vop (fvti.Vector fvti.RegClass:$rs1), - (fvti.Mask V0), - VLOpFrag)), - (!cast(instruction_name#"_"#fvti.LMul.MX#"_MASK") - (iwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1, - (fvti.Mask V0), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, fvti.Log2SEW, TA_MA)>; - } -} - multiclass VPatWConvertFP2I_RM_VL_V { foreach fvtiToFWti = AllWidenableFloatVectors in { @@ -1361,28 +1322,6 @@ multiclass VPatNConvertFP2IVL_W { - // Reuse the same list of types used in the widening nodes, but just swap the - // direction of types around so we're converting from Wti -> Vti - foreach vtiToWti = AllWidenableIntToFloatVectors in { - defvar vti = vtiToWti.Vti; - defvar fwti = vtiToWti.Wti; - let Predicates = !listconcat(GetVTypePredicates.Predicates, - GetVTypePredicates.Predicates) in - def : Pat<(vti.Vector (vop (fwti.Vector fwti.RegClass:$rs1), - (fwti.Mask V0), - VLOpFrag)), - (!cast(instruction_name#"_"#vti.LMul.MX#"_MASK") - (vti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1, - (fwti.Mask V0), - // Value to indicate no rounding mode change in - // RISCVInsertReadWriteCSR - FRM_DYN, - GPR:$vl, vti.Log2SEW, TA_MA)>; - } -} - multiclass VPatNConvertFP2I_RM_VL_W { foreach vtiToWti = AllWidenableIntToFloatVectors in { defvar vti = vtiToWti.Vti; @@ -2637,10 +2576,8 @@ foreach fvti = AllFloatVectors in { } // 13.17. 
Vector Single-Width Floating-Point/Integer Type-Convert Instructions -defm : VPatConvertFP2IVL_V_RM; -defm : VPatConvertFP2IVL_V_RM; -defm : VPatConvertFP2I_RM_VL_V; -defm : VPatConvertFP2I_RM_VL_V; +defm : VPatConvertFP2I_RM_VL_V; +defm : VPatConvertFP2I_RM_VL_V; defm : VPatConvertFP2IVL_V; defm : VPatConvertFP2IVL_V; @@ -2648,14 +2585,12 @@ defm : VPatConvertFP2IVL_V; defm : VPatConvertI2FPVL_V_RM; defm : VPatConvertI2FPVL_V_RM; -defm : VPatConvertI2FP_RM_VL_V; -defm : VPatConvertI2FP_RM_VL_V; +defm : VPatConvertI2FP_RM_VL_V; +defm : VPatConvertI2FP_RM_VL_V; // 13.18. Widening Floating-Point/Integer Type-Convert Instructions -defm : VPatWConvertFP2IVL_V_RM; -defm : VPatWConvertFP2IVL_V_RM; -defm : VPatWConvertFP2I_RM_VL_V; -defm : VPatWConvertFP2I_RM_VL_V; +defm : VPatWConvertFP2I_RM_VL_V; +defm : VPatWConvertFP2I_RM_VL_V; defm : VPatWConvertFP2IVL_V; defm : VPatWConvertFP2IVL_V; @@ -2694,10 +2629,8 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { } // 13.19 Narrowing Floating-Point/Integer Type-Convert Instructions -defm : VPatNConvertFP2IVL_W_RM; -defm : VPatNConvertFP2IVL_W_RM; -defm : VPatNConvertFP2I_RM_VL_W; -defm : VPatNConvertFP2I_RM_VL_W; +defm : VPatNConvertFP2I_RM_VL_W; +defm : VPatNConvertFP2I_RM_VL_W; defm : VPatNConvertFP2IVL_W; defm : VPatNConvertFP2IVL_W; @@ -2705,8 +2638,8 @@ defm : VPatNConvertFP2IVL_W; defm : VPatNConvertI2FPVL_W_RM; -defm : VPatNConvertI2FP_RM_VL_W; -defm : VPatNConvertI2FP_RM_VL_W; +defm : VPatNConvertI2FP_RM_VL_W; +defm : VPatNConvertI2FP_RM_VL_W; foreach fvtiToFWti = AllWidenableFloatVectors in { defvar fvti = fvtiToFWti.Vti; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 81467ada00448..1ad3e1b681466 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -222,7 +222,8 @@ let Predicates = [HasVendorXSfvfwmaccqqq], DecoderNamespace = "XSfvfwmaccqqq", def VFWMACC_4x4x4 : CustomSiFiveVMACC<0b111100, 
OPFVV, "sf.vfwmacc.4x4x4">; } -let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvfnrclipxfqf" in { +let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvfnrclipxfqf", + Uses = [FRM] in { def VFNRCLIP_XU_F_QF : CustomSiFiveVFNRCLIP<0b100010, OPFVF, "sf.vfnrclip.xu.f.qf">; def VFNRCLIP_X_F_QF : CustomSiFiveVFNRCLIP<0b100011, OPFVF, "sf.vfnrclip.x.f.qf">; } @@ -405,7 +406,7 @@ multiclass VPseudoSiFiveVFWMACC { multiclass VPseudoSiFiveVFNRCLIP { foreach i = 0-4 in - let hasSideEffects = 0 in + let hasSideEffects = 0, hasPostISelHook = 1 in defm "Pseudo" # NAME : VPseudoBinaryRoundingMode { - let VLMul = MInfo.value in - def "_" # MInfo.MX : VPseudoTernaryNoMask_Zvk; + LMULInfo MInfo, int sew = 0> { + let VLMul = MInfo.value, SEW = sew in { + defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); + def suffix : VPseudoTernaryNoMask_Zvk; + } } multiclass VPseudoBinaryV_V_NoMask_Zvk { @@ -348,12 +350,12 @@ multiclass VPseudoVSHA2CL { } } -multiclass VPseudoVSHA2MS { - foreach m = MxListVF4 in { +multiclass VPseudoVSHA2MS { + foreach m = !if(!eq(sew, 64), MxListVF8, MxListVF4) in { defvar mx = m.MX; - defm _VV : VPseudoTernaryNoMask_Zvk, + defm _VV : VPseudoTernaryNoMask_Zvk, SchedTernary<"WriteVSHA2MSV", "ReadVSHA2MSV", "ReadVSHA2MSV", - "ReadVSHA2MSV", mx>; + "ReadVSHA2MSV", mx, sew>; } } @@ -564,7 +566,9 @@ let Predicates = [HasStdExtZvkned] in { let Predicates = [HasStdExtZvknhaOrZvknhb] in { defm PseudoVSHA2CH : VPseudoVSHA2CH; defm PseudoVSHA2CL : VPseudoVSHA2CL; - defm PseudoVSHA2MS : VPseudoVSHA2MS; + defm PseudoVSHA2MS : VPseudoVSHA2MS; + let Predicates = [HasStdExtZvknhb] in + defm PseudoVSHA2MS : VPseudoVSHA2MS; } // Predicates = [HasStdExtZvknhaOrZvknhb] let Predicates = [HasStdExtZvksed] in { @@ -944,12 +948,14 @@ multiclass VPatUnaryV_V_S_NoMask_Zvk vtilist> { + list vtilist, + bit isSEWAware = false> { foreach vti = vtilist in def : VPatTernaryNoMaskWithPolicy; + vti.RegClass, vti.RegClass, + isSEWAware = 
isSEWAware>; } multiclass VPatBinaryV_VI_NoMask; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknha] let Predicates = [HasStdExtZvknhb] in { defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>; defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>; - defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors>; + defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>; } // Predicates = [HasStdExtZvknhb] let Predicates = [HasStdExtZvksed] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index d07ee393bbcfd..32fddeead3449 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -223,7 +223,7 @@ def SiFive7VS : ProcResource<1>; // Store sequencer // The VCQ accepts instructions from the the A Pipe and holds them until the // vector unit is ready to dequeue them. The unit dequeues up to one instruction // per cycle, in order, as soon as the sequencer for that type of instruction is -// avaliable. This resource is meant to be used for 1 cycle by all vector +// available. This resource is meant to be used for 1 cycle by all vector // instructions, to model that only one vector instruction may be dequed at a // time. The actual dequeueing into the sequencer is modeled by the VA, VL, and // VS sequencer resources below. 
Each of them will only accept a single diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td index 7a54d2fe10808..1af89903e0068 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td @@ -883,7 +883,8 @@ foreach mx = SchedMxList in { let Latency = 3, ReleaseAtCycles = [LMulLat] in { defm "" : LMULWriteResMX<"WriteVSHA2CHV", [SiFiveP400VEXQ0], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVSHA2CLV", [SiFiveP400VEXQ0], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSHA2MSV", [SiFiveP400VEXQ0], mx, IsWorstCase>; + foreach sew = !listremove(SchedSEWSet.val, [8, 16]) in + defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP400VEXQ0], mx, sew, IsWorstCase>; } // Zvkned let Latency = 2, ReleaseAtCycles = [LMulLat] in { @@ -1213,7 +1214,7 @@ defm "" : LMULReadAdvance<"ReadVGMULV", 0>; // Zvknha or Zvknhb defm "" : LMULReadAdvance<"ReadVSHA2CHV", 0>; defm "" : LMULReadAdvance<"ReadVSHA2CLV", 0>; -defm "" : LMULReadAdvance<"ReadVSHA2MSV", 0>; +defm "" : LMULSEWReadAdvance<"ReadVSHA2MSV", 0>; // Zvkned defm "" : LMULReadAdvance<"ReadVAESMVV", 0>; defm "" : LMULReadAdvance<"ReadVAESKF1V", 0>; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td index c685a6d2b094b..51aa003139fba 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td @@ -791,7 +791,8 @@ foreach mx = SchedMxList in { let Latency = 3, ReleaseAtCycles = [LMulLat] in { defm "" : LMULWriteResMX<"WriteVSHA2CHV", [SiFiveP600VectorArith], mx, IsWorstCase>; defm "" : LMULWriteResMX<"WriteVSHA2CLV", [SiFiveP600VectorArith], mx, IsWorstCase>; - defm "" : LMULWriteResMX<"WriteVSHA2MSV", [SiFiveP600VectorArith], mx, IsWorstCase>; + foreach sew = !listremove(SchedSEWSet.val, [8, 16]) in + defm "" : LMULSEWWriteResMXSEW<"WriteVSHA2MSV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>; } // Zvkned let 
Latency = 2, ReleaseAtCycles = [LMulLat] in { @@ -1119,7 +1120,7 @@ defm "" : LMULReadAdvance<"ReadVGMULV", 0>; // Zvknha or Zvknhb defm "" : LMULReadAdvance<"ReadVSHA2CHV", 0>; defm "" : LMULReadAdvance<"ReadVSHA2CLV", 0>; -defm "" : LMULReadAdvance<"ReadVSHA2MSV", 0>; +defm "" : LMULSEWReadAdvance<"ReadVSHA2MSV", 0>; // Zvkned defm "" : LMULReadAdvance<"ReadVAESMVV", 0>; defm "" : LMULReadAdvance<"ReadVAESKF1V", 0>; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleZvk.td b/llvm/lib/Target/RISCV/RISCVScheduleZvk.td index 640c456322f02..62d9bab0fac85 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleZvk.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleZvk.td @@ -36,7 +36,7 @@ defm "" : LMULSchedWrites<"WriteVGMULV">; /// Zvknha or Zvknhb extensions defm "" : LMULSchedWrites<"WriteVSHA2CHV">; defm "" : LMULSchedWrites<"WriteVSHA2CLV">; -defm "" : LMULSchedWrites<"WriteVSHA2MSV">; +defm "" : LMULSEWSchedWrites<"WriteVSHA2MSV">; /// Zvkned extension defm "" : LMULSchedWrites<"WriteVAESMVV">; @@ -79,7 +79,7 @@ defm "" : LMULSchedReads<"ReadVGMULV">; /// Zvknha or Zvknhb extensions defm "" : LMULSchedReads<"ReadVSHA2CHV">; defm "" : LMULSchedReads<"ReadVSHA2CLV">; -defm "" : LMULSchedReads<"ReadVSHA2MSV">; +defm "" : LMULSEWSchedReads<"ReadVSHA2MSV">; /// Zvkned extension defm "" : LMULSchedReads<"ReadVAESMVV">; @@ -153,11 +153,11 @@ multiclass UnsupportedSchedZvknhaOrZvknhb { let Unsupported = true in { defm "" : LMULWriteRes<"WriteVSHA2CHV", []>; defm "" : LMULWriteRes<"WriteVSHA2CLV", []>; -defm "" : LMULWriteRes<"WriteVSHA2MSV", []>; +defm "" : LMULSEWWriteRes<"WriteVSHA2MSV", []>; defm "" : LMULReadAdvance<"ReadVSHA2CHV", 0>; defm "" : LMULReadAdvance<"ReadVSHA2CLV", 0>; -defm "" : LMULReadAdvance<"ReadVSHA2MSV", 0>; +defm "" : LMULSEWReadAdvance<"ReadVSHA2MSV", 0>; } } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 395baa5f1aab9..b84c94f718b28 100644 --- 
a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -723,8 +723,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( // The interleaved memory access pass will lower interleaved memory ops (i.e // a load and store followed by a specific shuffle) to vlseg/vsseg - // intrinsics. In those cases then we can treat it as if it's just one (legal) - // memory op + // intrinsics. if (!UseMaskForCond && !UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { auto *VTy = cast(VecTy); @@ -734,19 +733,27 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( auto *SubVecTy = VectorType::get(VTy->getElementType(), VTy->getElementCount().divideCoefficientBy(Factor)); - if (VTy->getElementCount().isKnownMultipleOf(Factor) && TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment, AddressSpace, DL)) { - // FIXME: We use the memory op cost of the *legalized* type here, - // because it's getMemoryOpCost returns a really expensive cost for - // types like <6 x i8>, which show up when doing interleaves of - // Factor=3 etc. Should the memory op cost of these be cheaper? - auto *LegalVTy = VectorType::get(VTy->getElementType(), - LT.second.getVectorElementCount()); - InstructionCost LegalMemCost = getMemoryOpCost( - Opcode, LegalVTy, Alignment, AddressSpace, CostKind); - return LT.first + LegalMemCost; + + // Most available hardware today optimizes NF=2 as as one wide memory op + // + Factor * LMUL shuffle ops. + if (Factor == 2) { + InstructionCost Cost = + getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind); + MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT(); + Cost += Factor * TLI->getLMULCost(SubVecVT); + return LT.first * Cost; + } + + // Otherwise, the cost is proportional to the number of elements (VL * + // Factor ops). 
+ InstructionCost MemOpCost = + getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, + CostKind, {TTI::OK_AnyValue, TTI::OP_None}); + unsigned NumLoads = getEstimatedVLFor(VTy); + return NumLoads * MemOpCost; } } } @@ -948,12 +955,17 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { auto *RetTy = ICA.getReturnType(); switch (ICA.getID()) { + case Intrinsic::lrint: + case Intrinsic::llrint: + // We can't currently lower half or bfloat vector lrint/llrint. + if (auto *VecTy = dyn_cast(ICA.getArgTypes()[0]); + VecTy && VecTy->getElementType()->is16bitFPTy()) + return InstructionCost::getInvalid(); + [[fallthrough]]; case Intrinsic::ceil: case Intrinsic::floor: case Intrinsic::trunc: case Intrinsic::rint: - case Intrinsic::lrint: - case Intrinsic::llrint: case Intrinsic::round: case Intrinsic::roundeven: { // These all use the same code. @@ -1552,13 +1564,6 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, } // IR Reduction is composed by two vmv and one rvv reduction instruction. 
- if (TTI::requiresOrderedReduction(FMF)) { - Opcodes.push_back(RISCV::VFMV_S_F); - for (unsigned i = 0; i < LT.first.getValue(); i++) - Opcodes.push_back(RISCV::VFREDOSUM_VS); - Opcodes.push_back(RISCV::VFMV_F_S); - return getRISCVInstructionCost(Opcodes, LT.second, CostKind); - } unsigned SplitOp; switch (ISD) { case ISD::ADD: @@ -1582,7 +1587,14 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, if ((LT.second.getVectorElementType() == MVT::f16 && !ST->hasVInstructionsF16()) || LT.second.getVectorElementType() == MVT::bf16) - return InstructionCost::getInvalid(); + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); + if (TTI::requiresOrderedReduction(FMF)) { + Opcodes.push_back(RISCV::VFMV_S_F); + for (unsigned i = 0; i < LT.first.getValue(); i++) + Opcodes.push_back(RISCV::VFREDOSUM_VS); + Opcodes.push_back(RISCV::VFMV_F_S); + return getRISCVInstructionCost(Opcodes, LT.second, CostKind); + } SplitOp = RISCV::VFADD_VV; Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S}; break; @@ -2286,6 +2298,23 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } +bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) { + auto *VTy = dyn_cast(DataTy); + if (!VTy || VTy->isScalableTy()) + return false; + + if (!isLegalMaskedLoadStore(DataTy, Alignment)) + return false; + + // FIXME: If it is an i8 vector and the element count exceeds 256, we should + // scalarize these types with LMUL >= maximum fixed-length LMUL. 
+ if (VTy->getElementType()->isIntegerTy(8)) + if (VTy->getElementCount().getFixedValue() > 256) + return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() < + ST->getMaxLMULForFixedLengthVectors(); + return true; +} + bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { auto *VTy = dyn_cast(DataTy); if (!VTy || VTy->isScalableTy()) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 13d28e4db49cd..29a6c68a6c585 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -301,6 +301,8 @@ class RISCVTTIImpl : public BasicTTIImplBase { DL); } + bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment); + bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment); bool isVScaleKnownToBeAPowerOfTwo() const { diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt index 326343ae27814..aa83d997578fd 100644 --- a/llvm/lib/Target/SPIRV/CMakeLists.txt +++ b/llvm/lib/Target/SPIRV/CMakeLists.txt @@ -56,6 +56,7 @@ add_llvm_target(SPIRVCodeGen MC SPIRVDesc SPIRVInfo + ScalarOpts SelectionDAG Support Target diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 64fde8bf67ab9..62bd8d1f9d243 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -713,21 +713,36 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( return Reg; } +static std::string GetSpirvImageTypeName(const SPIRVType *Type, + MachineIRBuilder &MIRBuilder, + const std::string &Prefix); + static std::string buildSpirvTypeName(const SPIRVType *Type, MachineIRBuilder &MIRBuilder) { switch (Type->getOpcode()) { + case SPIRV::OpTypeSampledImage: { + return GetSpirvImageTypeName(Type, MIRBuilder, "sampled_image_"); + } case SPIRV::OpTypeImage: { - Register SampledTypeReg = Type->getOperand(1).getReg(); - auto 
*SampledType = MIRBuilder.getMRI()->getUniqueVRegDef(SampledTypeReg); - std::string TypeName = - "image_" + buildSpirvTypeName(SampledType, MIRBuilder); - for (uint32_t I = 2; I < Type->getNumOperands(); ++I) { - TypeName = (TypeName + '_' + Twine(Type->getOperand(I).getImm())).str(); - } - return TypeName; + return GetSpirvImageTypeName(Type, MIRBuilder, "image_"); + } + case SPIRV::OpTypeArray: { + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + Register ElementTypeReg = Type->getOperand(1).getReg(); + auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg); + const SPIRVType *TypeInst = MRI->getVRegDef(Type->getOperand(2).getReg()); + assert(TypeInst->getOpcode() != SPIRV::OpConstantI); + MachineInstr *ImmInst = MRI->getVRegDef(TypeInst->getOperand(1).getReg()); + assert(ImmInst->getOpcode() == TargetOpcode::G_CONSTANT); + uint32_t ArraySize = ImmInst->getOperand(1).getCImm()->getZExtValue(); + return (buildSpirvTypeName(ElementType, MIRBuilder) + Twine("[") + + Twine(ArraySize) + Twine("]")) + .str(); } case SPIRV::OpTypeFloat: return ("f" + Twine(Type->getOperand(1).getImm())).str(); + case SPIRV::OpTypeSampler: + return ("sampler"); case SPIRV::OpTypeInt: if (Type->getOperand(2).getImm()) return ("i" + Twine(Type->getOperand(1).getImm())).str(); @@ -737,6 +752,18 @@ static std::string buildSpirvTypeName(const SPIRVType *Type, } } +static std::string GetSpirvImageTypeName(const SPIRVType *Type, + MachineIRBuilder &MIRBuilder, + const std::string &Prefix) { + Register SampledTypeReg = Type->getOperand(1).getReg(); + auto *SampledType = MIRBuilder.getMRI()->getUniqueVRegDef(SampledTypeReg); + std::string TypeName = Prefix + buildSpirvTypeName(SampledType, MIRBuilder); + for (uint32_t I = 2; I < Type->getNumOperands(); ++I) { + TypeName = (TypeName + '_' + Twine(Type->getOperand(I).getImm())).str(); + } + return TypeName; +} + Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding( const SPIRVType *VarType, uint32_t Set, uint32_t Binding, 
MachineIRBuilder &MIRBuilder) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 11ed7d660be09..526305d7ed28a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -260,6 +260,7 @@ class SPIRVInstructionSelector : public InstructionSelector { SPIRVType *SrcPtrTy) const; Register buildPointerToResource(const SPIRVType *ResType, uint32_t Set, uint32_t Binding, uint32_t ArraySize, + Register IndexReg, bool IsNonUniform, MachineIRBuilder MIRBuilder) const; }; @@ -2616,10 +2617,15 @@ void SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, uint32_t Set = foldImm(I.getOperand(2), MRI); uint32_t Binding = foldImm(I.getOperand(3), MRI); uint32_t ArraySize = foldImm(I.getOperand(4), MRI); + Register IndexReg = I.getOperand(5).getReg(); + bool IsNonUniform = ArraySize > 1 && foldImm(I.getOperand(6), MRI); MachineIRBuilder MIRBuilder(I); - Register VarReg = - buildPointerToResource(ResType, Set, Binding, ArraySize, MIRBuilder); + Register VarReg = buildPointerToResource(ResType, Set, Binding, ArraySize, + IndexReg, IsNonUniform, MIRBuilder); + + if (IsNonUniform) + buildOpDecorate(ResVReg, I, TII, SPIRV::Decoration::NonUniformEXT, {}); // TODO: For now we assume the resource is an image, which needs to be // loaded to get the handle. That will not be true for storage buffers. 
@@ -2631,10 +2637,35 @@ void SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, Register SPIRVInstructionSelector::buildPointerToResource( const SPIRVType *ResType, uint32_t Set, uint32_t Binding, - uint32_t ArraySize, MachineIRBuilder MIRBuilder) const { - assert(ArraySize == 1 && "Resource arrays are not implemented yet."); - return GR.getOrCreateGlobalVariableWithBinding(ResType, Set, Binding, - MIRBuilder); + uint32_t ArraySize, Register IndexReg, bool IsNonUniform, + MachineIRBuilder MIRBuilder) const { + if (ArraySize == 1) + return GR.getOrCreateGlobalVariableWithBinding(ResType, Set, Binding, + MIRBuilder); + + const SPIRVType *VarType = GR.getOrCreateSPIRVArrayType( + ResType, ArraySize, *MIRBuilder.getInsertPt(), TII); + Register VarReg = GR.getOrCreateGlobalVariableWithBinding( + VarType, Set, Binding, MIRBuilder); + + SPIRVType *ResPointerType = GR.getOrCreateSPIRVPointerType( + ResType, MIRBuilder, SPIRV::StorageClass::UniformConstant); + + Register AcReg = MRI->createVirtualRegister(&SPIRV::iIDRegClass); + if (IsNonUniform) { + // It is unclear which value needs to be marked an non-uniform, so both + // the index and the access changed are decorated as non-uniform. 
+ buildOpDecorate(IndexReg, MIRBuilder, SPIRV::Decoration::NonUniformEXT, {}); + buildOpDecorate(AcReg, MIRBuilder, SPIRV::Decoration::NonUniformEXT, {}); + } + + MIRBuilder.buildInstr(SPIRV::OpAccessChain) + .addDef(AcReg) + .addUse(GR.getSPIRVTypeID(ResPointerType)) + .addUse(VarReg) + .addUse(IndexReg); + + return AcReg; } bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db5463f5c7abb..29ce60d9983e3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -689,11 +689,31 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan( const SPIRVSubtarget &ST) { addAvailableCaps({Capability::Shader, Capability::Linkage}); - // Provided by all supported Vulkan versions. + // Core in Vulkan 1.1 and earlier. addAvailableCaps({Capability::Int16, Capability::Int64, Capability::Float16, Capability::Float64, Capability::GroupNonUniform, Capability::Image1D, Capability::SampledBuffer, - Capability::ImageBuffer}); + Capability::ImageBuffer, + Capability::UniformBufferArrayDynamicIndexing, + Capability::SampledImageArrayDynamicIndexing, + Capability::StorageBufferArrayDynamicIndexing, + Capability::StorageImageArrayDynamicIndexing}); + + // Became core in Vulkan 1.2 + if (ST.isAtLeastSPIRVVer(VersionTuple(1, 5))) { + addAvailableCaps( + {Capability::ShaderNonUniformEXT, Capability::RuntimeDescriptorArrayEXT, + Capability::InputAttachmentArrayDynamicIndexingEXT, + Capability::UniformTexelBufferArrayDynamicIndexingEXT, + Capability::StorageTexelBufferArrayDynamicIndexingEXT, + Capability::UniformBufferArrayNonUniformIndexingEXT, + Capability::SampledImageArrayNonUniformIndexingEXT, + Capability::StorageBufferArrayNonUniformIndexingEXT, + Capability::StorageImageArrayNonUniformIndexingEXT, + Capability::InputAttachmentArrayNonUniformIndexingEXT, + 
Capability::UniformTexelBufferArrayNonUniformIndexingEXT, + Capability::StorageTexelBufferArrayNonUniformIndexingEXT}); + } } } // namespace SPIRV @@ -729,6 +749,8 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex, Dec == SPIRV::Decoration::ImplementInRegisterMapINTEL) { Reqs.addExtension( SPIRV::Extension::SPV_INTEL_global_variable_fpga_decorations); + } else if (Dec == SPIRV::Decoration::NonUniformEXT) { + Reqs.addRequirements(SPIRV::Capability::ShaderNonUniformEXT); } } @@ -848,6 +870,136 @@ static void AddAtomicFloatRequirements(const MachineInstr &MI, } } +bool isUniformTexelBuffer(MachineInstr *ImageInst) { + if (ImageInst->getOpcode() != SPIRV::OpTypeImage) + return false; + uint32_t Dim = ImageInst->getOperand(2).getImm(); + uint32_t Sampled = ImageInst->getOperand(6).getImm(); + return Dim == SPIRV::Dim::DIM_Buffer && Sampled == 1; +} + +bool isStorageTexelBuffer(MachineInstr *ImageInst) { + if (ImageInst->getOpcode() != SPIRV::OpTypeImage) + return false; + uint32_t Dim = ImageInst->getOperand(2).getImm(); + uint32_t Sampled = ImageInst->getOperand(6).getImm(); + return Dim == SPIRV::Dim::DIM_Buffer && Sampled == 2; +} + +bool isSampledImage(MachineInstr *ImageInst) { + if (ImageInst->getOpcode() != SPIRV::OpTypeImage) + return false; + uint32_t Dim = ImageInst->getOperand(2).getImm(); + uint32_t Sampled = ImageInst->getOperand(6).getImm(); + return Dim != SPIRV::Dim::DIM_Buffer && Sampled == 1; +} + +bool isInputAttachment(MachineInstr *ImageInst) { + if (ImageInst->getOpcode() != SPIRV::OpTypeImage) + return false; + uint32_t Dim = ImageInst->getOperand(2).getImm(); + uint32_t Sampled = ImageInst->getOperand(6).getImm(); + return Dim == SPIRV::Dim::DIM_SubpassData && Sampled == 2; +} + +bool isStorageImage(MachineInstr *ImageInst) { + if (ImageInst->getOpcode() != SPIRV::OpTypeImage) + return false; + uint32_t Dim = ImageInst->getOperand(2).getImm(); + uint32_t Sampled = ImageInst->getOperand(6).getImm(); + return Dim != 
SPIRV::Dim::DIM_Buffer && Sampled == 2; +} + +bool isCombinedImageSampler(MachineInstr *SampledImageInst) { + if (SampledImageInst->getOpcode() != SPIRV::OpTypeSampledImage) + return false; + + const MachineRegisterInfo &MRI = SampledImageInst->getMF()->getRegInfo(); + Register ImageReg = SampledImageInst->getOperand(1).getReg(); + auto *ImageInst = MRI.getUniqueVRegDef(ImageReg); + return isSampledImage(ImageInst); +} + +bool hasNonUniformDecoration(Register Reg, const MachineRegisterInfo &MRI) { + for (const auto &MI : MRI.reg_instructions(Reg)) { + if (MI.getOpcode() != SPIRV::OpDecorate) + continue; + + uint32_t Dec = MI.getOperand(1).getImm(); + if (Dec == SPIRV::Decoration::NonUniformEXT) + return true; + } + return false; +} + +void addOpAccessChainReqs(const MachineInstr &Instr, + SPIRV::RequirementHandler &Handler, + const SPIRVSubtarget &Subtarget) { + const MachineRegisterInfo &MRI = Instr.getMF()->getRegInfo(); + // Get the result type. If it is an image type, then the shader uses + // descriptor indexing. The appropriate capabilities will be added based + // on the specifics of the image. 
+ Register ResTypeReg = Instr.getOperand(1).getReg(); + MachineInstr *ResTypeInst = MRI.getUniqueVRegDef(ResTypeReg); + + assert(ResTypeInst->getOpcode() == SPIRV::OpTypePointer); + uint32_t StorageClass = ResTypeInst->getOperand(1).getImm(); + if (StorageClass != SPIRV::StorageClass::StorageClass::UniformConstant && + StorageClass != SPIRV::StorageClass::StorageClass::Uniform && + StorageClass != SPIRV::StorageClass::StorageClass::StorageBuffer) { + return; + } + + Register PointeeTypeReg = ResTypeInst->getOperand(2).getReg(); + MachineInstr *PointeeType = MRI.getUniqueVRegDef(PointeeTypeReg); + if (PointeeType->getOpcode() != SPIRV::OpTypeImage && + PointeeType->getOpcode() != SPIRV::OpTypeSampledImage && + PointeeType->getOpcode() != SPIRV::OpTypeSampler) { + return; + } + + bool IsNonUniform = + hasNonUniformDecoration(Instr.getOperand(0).getReg(), MRI); + if (isUniformTexelBuffer(PointeeType)) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::UniformTexelBufferArrayNonUniformIndexingEXT); + else + Handler.addRequirements( + SPIRV::Capability::UniformTexelBufferArrayDynamicIndexingEXT); + } else if (isInputAttachment(PointeeType)) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::InputAttachmentArrayNonUniformIndexingEXT); + else + Handler.addRequirements( + SPIRV::Capability::InputAttachmentArrayDynamicIndexingEXT); + } else if (isStorageTexelBuffer(PointeeType)) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::StorageTexelBufferArrayNonUniformIndexingEXT); + else + Handler.addRequirements( + SPIRV::Capability::StorageTexelBufferArrayDynamicIndexingEXT); + } else if (isSampledImage(PointeeType) || + isCombinedImageSampler(PointeeType) || + PointeeType->getOpcode() == SPIRV::OpTypeSampler) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::SampledImageArrayNonUniformIndexingEXT); + else + Handler.addRequirements( + SPIRV::Capability::SampledImageArrayDynamicIndexing); + } else if 
(isStorageImage(PointeeType)) { + if (IsNonUniform) + Handler.addRequirements( + SPIRV::Capability::StorageImageArrayNonUniformIndexingEXT); + else + Handler.addRequirements( + SPIRV::Capability::StorageImageArrayDynamicIndexing); + } +} + void addInstrRequirements(const MachineInstr &MI, SPIRV::RequirementHandler &Reqs, const SPIRVSubtarget &ST) { @@ -967,11 +1119,17 @@ void addInstrRequirements(const MachineInstr &MI, case SPIRV::OpConstantSampler: Reqs.addCapability(SPIRV::Capability::LiteralSampler); break; + case SPIRV::OpInBoundsAccessChain: + case SPIRV::OpAccessChain: + addOpAccessChainReqs(MI, Reqs, ST); + break; case SPIRV::OpTypeImage: addOpTypeImageReqs(MI, Reqs, ST); break; case SPIRV::OpTypeSampler: - Reqs.addCapability(SPIRV::Capability::ImageBasic); + if (!ST.isVulkanEnv()) { + Reqs.addCapability(SPIRV::Capability::ImageBasic); + } break; case SPIRV::OpTypeForwardPointer: // TODO: check if it's OpenCL's kernel. diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 3c2af34dd5523..cc34cf877dea9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -165,6 +165,57 @@ static MachineInstr *findAssignTypeInstr(Register Reg, return nullptr; } +static void buildOpBitcast(SPIRVGlobalRegistry *GR, MachineIRBuilder &MIB, + Register ResVReg, Register OpReg) { + SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResVReg); + SPIRVType *OpType = GR->getSPIRVTypeForVReg(OpReg); + assert(ResType && OpType && "Operand types are expected"); + if (!GR->isBitcastCompatible(ResType, OpType)) + report_fatal_error("incompatible result and operand types in a bitcast"); + MachineRegisterInfo *MRI = MIB.getMRI(); + if (!MRI->getRegClassOrNull(ResVReg)) + MRI->setRegClass(ResVReg, GR->getRegClass(ResType)); + MIB.buildInstr(SPIRV::OpBitcast) + .addDef(ResVReg) + .addUse(GR->getSPIRVTypeID(ResType)) + .addUse(OpReg); +} + +// We do instruction selections early instead of 
calling MIB.buildBitcast() +// generating the general op code G_BITCAST. When MachineVerifier validates +// G_BITCAST we see a check of a kind: if Source Type is equal to Destination +// Type then report error "bitcast must change the type". This doesn't take into +// account the notion of a typed pointer that is important for SPIR-V where a +// user may and should use bitcast between pointers with different pointee types +// (https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpBitcast). +// It's important for correct lowering in SPIR-V, because interpretation of the +// data type is not left to instructions that utilize the pointer, but encoded +// by the pointer declaration, and the SPIRV target can and must handle the +// declaration and use of pointers that specify the type of data they point to. +// It's not feasible to improve validation of G_BITCAST using just information +// provided by low level types of source and destination. Therefore we don't +// produce G_BITCAST as the general op code with semantics different from +// OpBitcast, but rather lower to OpBitcast immediately. As for now, the only +// difference would be that CombinerHelper couldn't transform known patterns +// around G_BUILD_VECTOR. See discussion +// in https://github.com/llvm/llvm-project/pull/110270 for even more context. 
+static void selectOpBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, + MachineIRBuilder MIB) { + SmallVector ToErase; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != TargetOpcode::G_BITCAST) + continue; + MIB.setInsertPt(*MI.getParent(), MI); + buildOpBitcast(GR, MIB, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + ToErase.push_back(&MI); + } + } + for (MachineInstr *MI : ToErase) + MI->eraseFromParent(); +} + static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { // Get access to information about available extensions @@ -202,15 +253,6 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, } else { GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF); MIB.buildBitcast(Def, Source); - // MachineVerifier requires that bitcast must change the type. - // Change AddressSpace if needed to hint that Def and Source points to - // different types: this doesn't change actual code generation. 
- LLT DefType = MRI->getType(Def); - if (DefType == MRI->getType(Source)) - MRI->setType(Def, - LLT::pointer((DefType.getAddressSpace() + 1) % - SPIRVSubtarget::MaxLegalAddressSpace, - GR->getPointerSize())); } } } @@ -1007,6 +1049,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { removeImplicitFallthroughs(MF, MIB); insertSpirvDecorations(MF, MIB); insertInlineAsm(MF, GR, ST, MIB); + selectOpBitcasts(MF, GR, MIB); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 13ad1eb8e8b33..d63438baca7e7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -355,7 +355,9 @@ defm GeometryPointSize : CapabilityOperand<24, 0, 0, [], [Geometry]>; defm ImageGatherExtended : CapabilityOperand<25, 0, 0, [], [Shader]>; defm StorageImageMultisample : CapabilityOperand<27, 0, 0, [], [Shader]>; defm UniformBufferArrayDynamicIndexing : CapabilityOperand<28, 0, 0, [], [Shader]>; -defm SampledImageArrayDymnamicIndexing : CapabilityOperand<29, 0, 0, [], [Shader]>; +defm SampledImageArrayDynamicIndexing : CapabilityOperand<29, 0, 0, [], [Shader]>; +defm StorageBufferArrayDynamicIndexing : CapabilityOperand<30, 0, 0, [], [Shader]>; +defm StorageImageArrayDynamicIndexing : CapabilityOperand<31, 0, 0, [], [Shader]>; defm ClipDistance : CapabilityOperand<32, 0, 0, [], [Shader]>; defm CullDistance : CapabilityOperand<33, 0, 0, [], [Shader]>; defm SampleRateShading : CapabilityOperand<35, 0, 0, [], [Shader]>; diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 1fa2dbfb26fc2..3999b54de81b6 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2676,10 +2676,7 @@ static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL, (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) { // Disable the nsw 
and nuw flags: the backend needs to handle // overflow as well during comparison elimination. - SDNodeFlags Flags = N->getFlags(); - Flags.setNoSignedWrap(false); - Flags.setNoUnsignedWrap(false); - N->setFlags(Flags); + N->dropFlags(SDNodeFlags::NoWrap); C.Op0 = SDValue(N, 0); C.Op1 = DAG.getConstant(0, DL, N->getValueType(0)); return; diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 03c333b90108e..07a00af881afe 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(Reg)); } + bool isTILEPair() const { + return Kind == Register && + X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); + } + + void addTILEPairOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Reg = getReg(); + switch (Reg) { + default: + llvm_unreachable("Invalid tile register!"); + case X86::TMM0: + case X86::TMM1: + Reg = X86::TMM0_TMM1; + break; + case X86::TMM2: + case X86::TMM3: + Reg = X86::TMM2_TMM3; + break; + case X86::TMM4: + case X86::TMM5: + Reg = X86::TMM4_TMM5; + break; + case X86::TMM6: + case X86::TMM7: + Reg = X86::TMM6_TMM7; + break; + } + Inst.addOperand(MCOperand::createReg(Reg)); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); if (getMemBaseReg()) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index ee1c8144f681e..f198234f1ca30 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -806,6 +806,10 @@ static int readModRM(struct InternalInstruction *insn) { if (index > 7) \ *valid = 0; \ return prefix##_TMM0 + index; \ + case TYPE_TMM_PAIR: \ + if (index > 7) \ + *valid = 0; \ + return 
prefix##_TMM0_TMM1 + (index / 2); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -2315,6 +2319,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_YMM: case TYPE_ZMM: case TYPE_TMM: + case TYPE_TMM_PAIR: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index b0aa70be12d83..dc9af2caa77b1 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -535,6 +535,12 @@ namespace X86Disassembler { ENTRY(TMM6) \ ENTRY(TMM7) +#define REGS_TMM_PAIRS \ + ENTRY(TMM0_TMM1) \ + ENTRY(TMM2_TMM3) \ + ENTRY(TMM4_TMM5) \ + ENTRY(TMM6_TMM7) + #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -559,6 +565,7 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_TMM \ + REGS_TMM_PAIRS \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index e7ba13215feb5..51b82321d679b 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -463,3 +463,22 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, } llvm_unreachable("Unknown mask pair register name"); } + +void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + switch (MI->getOperand(OpNo).getReg()) { + case X86::TMM0_TMM1: + printRegName(OS, X86::TMM0); + return; + case X86::TMM2_TMM3: + printRegName(OS, X86::TMM2); + return; + case X86::TMM4_TMM5: + printRegName(OS, X86::TMM4); + return; + case X86::TMM6_TMM7: + printRegName(OS, X86::TMM6); + return; + } + llvm_unreachable("Unknown mask pair register name"); +} diff --git 
a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 221102e17c653..2a7b750bd6752 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -38,6 +38,7 @@ class X86InstPrinterCommon : public MCInstPrinter { const MCSubtargetInfo &STI); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 6bedf9e1d13ac..160e7c0fc0310 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -270,6 +270,12 @@ def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true", def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", "Support AMX-COMPLEX instructions", [FeatureAMXTILE]>; +def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", + "Support AMX-FP8 instructions", + [FeatureAMXTILE]>; +def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true", + "Support AMX amx-transpose instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f4c67f115c9f3..f832955d1202f 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -568,6 +568,131 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } + // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding + // AMX instruction to support it. 
So, split it to 2 load instructions: + // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" --> + // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" + + // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment" + case X86::PTILEPAIRLOAD: { + int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); + Register TReg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); + Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); + unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; + + MachineInstrBuilder MIBLo = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) + .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead)); + MachineInstrBuilder MIBHi = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD)) + .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(1 + i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + TmmSize); + else + MIBHi.add(MBBI->getOperand(1 + i)); + } + + // Make sure the first stride reg used in first tileload is alive. + MachineOperand &Stride = + MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg); + Stride.setIsKill(false); + + // Split the memory operand, adjusting the offset and size for the halves. + MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); + MachineMemOperand *MMOHi = + MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo. + MBB.erase(MBBI); + return true; + } + // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no + // corresponding AMX instruction to support it. 
So, split it too: + // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" --> + // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" + + // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1" + case X86::PTILEPAIRSTORE: { + int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); + Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg(); + bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); + Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0); + Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1); + unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8; + + MachineInstrBuilder MIBLo = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); + MachineInstrBuilder MIBHi = + BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + TmmSize); + else + MIBHi.add(MBBI->getOperand(i)); + } + MIBLo.addReg(TReg0, getKillRegState(SrcIsKill)); + MIBHi.addReg(TReg1, getKillRegState(SrcIsKill)); + + // Make sure the first stride reg used in first tilestore is alive. + MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg); + Stride.setIsKill(false); + + // Split the memory operand, adjusting the offset and size for the halves. + MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize); + MachineMemOperand *MMOHi = + MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo. 
+ MBB.erase(MBBI); + return true; + } + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: { + for (unsigned i = 3; i > 0; --i) + MI.removeOperand(i); + unsigned Opc; + switch (Opcode) { + case X86::PT2RPNTLVWZ0V: + Opc = X86::T2RPNTLVWZ0; + break; + case X86::PT2RPNTLVWZ0T1V: + Opc = X86::T2RPNTLVWZ0T1; + break; + case X86::PT2RPNTLVWZ1V: + Opc = X86::T2RPNTLVWZ1; + break; + case X86::PT2RPNTLVWZ1T1V: + Opc = X86::T2RPNTLVWZ1T1; + break; + default: + llvm_unreachable("Impossible Opcode!"); + } + MI.setDesc(TII->get(Opc)); + return true; + } + case X86::PTTRANSPOSEDV: { + for (int i = 2; i > 0; --i) + MI.removeOperand(i); + MI.setDesc(TII->get(X86::TTRANSPOSED)); + return true; + } case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index d50a4d3b23ae2..62d0f6ca79434 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -268,24 +268,36 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI, << printReg(TileReg, TRI) << '\n'); } +static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual()) { + unsigned RegClassID = MRI->getRegClass(Reg)->getID(); + if (RegClassID == X86::TILERegClassID) + return 1; + if (RegClassID == X86::TILEPAIRRegClassID) + return 2; + } else { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; + } + return 0; +} + +static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) { + return getTileDefNum(MRI, VirtReg) > 0; +} + static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { // The instruction must have 3 operands: tile def, row, col. 
if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo()) return false; MachineOperand &MO = MI.getOperand(0); - if (MO.isReg()) { - Register Reg = MO.getReg(); - // FIXME it may be used after Greedy RA and the physical - // register is not rewritten yet. - if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return true; - if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; - } + if (!MO.isReg()) + return false; - return false; + return getTileDefNum(MRI, MO.getReg()) > 0; } static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) { @@ -424,8 +436,7 @@ void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB, static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) { MachineOperand &MO = MI.getOperand(0); - if (MO.isReg() && MO.getReg().isVirtual() && - MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + if (MO.isReg() && MO.getReg().isVirtual() && isTileRegister(MRI, MO.getReg())) return true; return false; } @@ -524,8 +535,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + if (Reg.isVirtual() && isTileRegister(MRI, Reg)) return true; } return false; @@ -617,6 +627,19 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { else if (dominates(MBB, LastShapeMI, ColMI)) LastShapeMI = ColMI; } + unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg()); + if (TileDefNum > 1) { + for (unsigned I = 1; I < TileDefNum; I++) { + MachineOperand *ColxMO = &MI.getOperand(2 + I); + MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg()); + if (ColxMI->getParent() == &MBB) { + if (!LastShapeMI) + LastShapeMI = ColxMI; + else if (dominates(MBB, LastShapeMI, ColxMI)) + LastShapeMI = ColxMI; + } + } + } // If there is user live out of the tilecfg, spill it and reload in // before the user. 
Register TileReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 70bc11228be6a..72264dd6a5c38 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE, INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE, "Fast Tile Register Configure", false, false) -static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) { +static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) { // There is no phi instruction after register allocation. assert(MI.isPHI() == false); // The instruction must have 3 operands: tile def, row, col. // It should be AMX pseudo instruction that have shape operand. if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 || !MI.isPseudo()) - return false; + return 0; MachineOperand &MO = MI.getOperand(0); if (MO.isReg()) { Register Reg = MO.getReg(); - // FIXME it may be used after Greedy RA and the physical + // FIXME: It may be used after Greedy RA and the physical // register is not rewritten yet. 
- if (Reg.isVirtual() && - MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) - return true; + if (Reg.isVirtual()) { + if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) + return 1; + if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID) + return 2; + } if (Reg >= X86::TMM0 && Reg <= X86::TMM7) - return true; + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; } - return false; + return 0; +} + +static unsigned getTMMIndex(Register Reg) { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return Reg - X86::TMM0; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return (Reg - X86::TMM0_TMM1) * 2; + llvm_unreachable("Invalid Tmm Reg!"); } // PreTileConfig should configure the tile registers based on basic @@ -110,14 +123,17 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) { bool Change = false; SmallVector, 6> ShapeInfos; for (MachineInstr &MI : reverse(MBB)) { - if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV) + unsigned DefNum = getNumDefTiles(MRI, MI); + if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV) continue; // AMX instructions that define tile register. if (MI.getOpcode() != X86::PLDTILECFGV) { MachineOperand &Row = MI.getOperand(1); - MachineOperand &Col = MI.getOperand(2); - unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0; - ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)}); + unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg()); + for (unsigned I = 0; I < DefNum; I++) { + MachineOperand &Col = MI.getOperand(2 + I); + ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)}); + } } else { // PLDTILECFGV // Rewrite the shape information to memory. Stack slot should have // been initialized to zero in pre config. 
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 70e4c199190d6..aea86c280e2f9 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -323,6 +323,35 @@ namespace { Segment = CurDAG->getRegister(0, MVT::i16); } + // Utility function to determine whether it is AMX SDNode right after + // lowering but before ISEL. + bool isAMXSDNode(SDNode *N) const { + // Check if N is AMX SDNode: + // 1. check specific opcode since these carry MVT::Untyped instead of + // x86amx_type; + // 2. check result type; + // 3. check operand type; + switch (N->getOpcode()) { + default: + break; + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: + return true; + } + for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) { + if (N->getValueType(Idx) == MVT::x86amx) + return true; + } + for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) { + SDValue Op = N->getOperand(Idx); + if (Op.getValueType() == MVT::x86amx) + return true; + } + return false; + } + // Utility function to determine whether we should avoid selecting // immediate forms of instructions for better code size or not. 
// At a high level, we'd like to avoid such instructions when @@ -5278,6 +5307,47 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_t2rpntlvwz0: + case Intrinsic::x86_t2rpntlvwz0t1: + case Intrinsic::x86_t2rpntlvwz1: + case Intrinsic::x86_t2rpntlvwz1t1: { + if (!Subtarget->hasAMXTRANSPOSE()) + break; + auto *MFI = + CurDAG->getMachineFunction().getInfo(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + unsigned Opc; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0: + Opc = X86::PT2RPNTLVWZ0; + break; + case Intrinsic::x86_t2rpntlvwz0t1: + Opc = X86::PT2RPNTLVWZ0T1; + break; + case Intrinsic::x86_t2rpntlvwz1: + Opc = X86::PT2RPNTLVWZ1; + break; + case Intrinsic::x86_t2rpntlvwz1t1: + Opc = X86::PT2RPNTLVWZ1T1; + break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceNode(Node, CNode); + return; + } } break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34bc5d76c15ce..0ae814d0ca3bb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27291,6 +27291,53 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } + case Intrinsic::x86_t2rpntlvwz0_internal: + case Intrinsic::x86_t2rpntlvwz0t1_internal: + case 
Intrinsic::x86_t2rpntlvwz1_internal: + case Intrinsic::x86_t2rpntlvwz1t1_internal: { + if (!Subtarget.hasAMXTILE()) + break; + auto *X86MFI = DAG.getMachineFunction().getInfo(); + X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); + unsigned IntNo = Op.getConstantOperandVal(1); + unsigned Opc = 0; + switch (IntNo) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_t2rpntlvwz0_internal: + Opc = X86::PT2RPNTLVWZ0V; + break; + case Intrinsic::x86_t2rpntlvwz0t1_internal: + Opc = X86::PT2RPNTLVWZ0T1V; + break; + case Intrinsic::x86_t2rpntlvwz1_internal: + Opc = X86::PT2RPNTLVWZ1V; + break; + case Intrinsic::x86_t2rpntlvwz1t1_internal: + Opc = X86::PT2RPNTLVWZ1T1V; + break; + } + + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); + + SDValue Ops[] = {Op.getOperand(2), // Row + Op.getOperand(3), // Col0 + Op.getOperand(4), // Col1 + Op.getOperand(5), // Base + DAG.getTargetConstant(1, DL, MVT::i8), // Scale + Op.getOperand(6), // Index + DAG.getTargetConstant(0, DL, MVT::i32), // Disp + DAG.getRegister(0, MVT::i16), // Segment + Op.getOperand(0)}; // Chain + + MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops); + SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx, + SDValue(Res, 0)); + SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx, + SDValue(Res, 0)); + return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL); + } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { @@ -37039,6 +37086,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; + auto TMMImmToTMMPair = [](unsigned Imm) { + assert(Imm < 8 && "Illegal tmm pair index."); + return X86::TMM0_TMM1 + Imm / 2; + }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: @@ -37420,7 +37471,11 @@ 
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBUSD: case X86::PTDPBUUD: case X86::PTDPBF16PS: - case X86::PTDPFP16PS: { + case X86::PTDPFP16PS: + case X86::PTDPBF8PS: + case X86::PTDPBHF8PS: + case X86::PTDPHBF8PS: + case X86::PTDPHF8PS: { unsigned Opc; switch (MI.getOpcode()) { // clang-format off @@ -37431,6 +37486,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; + case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; // clang-format on } @@ -37513,6 +37572,49 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PT2RPNTLVWZ0: + case X86::PT2RPNTLVWZ0T1: + case X86::PT2RPNTLVWZ1: + case X86::PT2RPNTLVWZ1T1: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PT2RPNTLVWZ0: + Opc = X86::T2RPNTLVWZ0; + break; + case X86::PT2RPNTLVWZ0T1: + Opc = X86::T2RPNTLVWZ0T1; + break; + case X86::PT2RPNTLVWZ1: + Opc = X86::T2RPNTLVWZ1; + break; + case X86::PT2RPNTLVWZ1T1: + Opc = X86::T2RPNTLVWZ1T1; + break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); + + MIB.add(MI.getOperand(1)); // base + MIB.add(MI.getOperand(2)); // scale + MIB.add(MI.getOperand(3)); // index + MIB.add(MI.getOperand(4)); // displacement + MIB.add(MI.getOperand(5)); // segment + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; + } + case X86::PTTRANSPOSED: { + const DebugLoc &DL = MI.getDebugLoc(); + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::TTRANSPOSED)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } @@ -49321,7 +49423,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode())) // N0 is all ones or undef. We guarantee that the bits shifted into the // result are all ones, not undef. - return DAG.getConstant(-1, SDLoc(N), VT); + return DAG.getAllOnesConstant(SDLoc(N), VT); auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) { unsigned NewShiftVal = Amt0 + Amt1; diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 99deacc811a17..947a8bec2890e 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -267,3 +267,105 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { } } // SchedRW = [WriteSystem] } + +// AMX-FP8 +let Predicates = [HasAMXFP8, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + class AMX_FP8_BASE Opcode, string Opstr> : + I, VEX, VVVV; + } + + def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS; + def TDPBHF8PS : AMX_FP8_BASE<0xfd, "tdpbhf8ps">, T_MAP5, XD; + def TDPHBF8PS : AMX_FP8_BASE<0xfd, "tdphbf8ps">, T_MAP5, XS; + def TDPHF8PS : AMX_FP8_BASE<0xfd, "tdphf8ps">, T_MAP5, PD; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. 
+ // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPBHF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbhf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPHBF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdphbf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPHF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdphf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + } + } +} + +let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in { + let mayStore = 1 in + def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>; + let mayLoad = 1 in + def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; +} + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { + def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", + []>, VEX, WIG, T8,PS; + + def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PS; + + def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PD; + + def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), + (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", + []>, VEX, T8,PD; + + def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + let isPseudo = true in { + def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst), + (ins 
GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), + []>; + } + + def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst), + (ins GR16:$src1, GR16:$src2, TILE:$src), + [(set TILE: $dst, + (int_x86_ttransposed_internal GR16:$src1, GR16:$src2, + TILE:$src))]>; + + let usesCustomInserter = 1 in { + def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst, + sibmem:$src1), []>; + def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src), + [(int_x86_ttransposed timm:$dst, timm:$src)]>; + } + } +} // HasAMXTILE, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 38ea1f35be2b9..9b002ebd3a93b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4538,6 +4538,11 @@ static unsigned getLoadStoreRegOpcode(Register Reg, return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) : GET_EGPR_IF_ENABLED(X86::TILESTORED); #undef GET_EGPR_IF_ENABLED + case 2048: + assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) && + "Unknown 2048-byte regclass"); + assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE"); + return Load ? 
X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE; } } @@ -4732,6 +4737,7 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILESTORED: case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: + case X86::PTILEPAIRLOAD: return true; } } @@ -4744,7 +4750,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, default: llvm_unreachable("Unexpected special opcode!"); case X86::TILESTORED: - case X86::TILESTORED_EVEX: { + case X86::TILESTORED_EVEX: + case X86::PTILEPAIRSTORE: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4758,7 +4765,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, break; } case X86::TILELOADD: - case X86::TILELOADD_EVEX: { + case X86::TILELOADD_EVEX: + case X86::PTILEPAIRLOAD: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td index f8f5cd83166e3..2102cb4b6b5b7 100644 --- a/llvm/lib/Target/X86/X86InstrOperands.td +++ b/llvm/lib/Target/X86/X86InstrOperands.td @@ -501,3 +501,10 @@ def VK8Pair : RegisterOperand { def VK16Pair : RegisterOperand { let ParserMatchClass = VK16PairAsmOperand; } + +let RenderMethod = "addTILEPairOperands" in + def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; } + +def TILEPair : RegisterOperand { + let ParserMatchClass = TILEPairAsmOperand; +} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 7fb566fba5181..d22e7dadaaa26 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,6 +183,8 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; 
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; +def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; +def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 919e1eb3e38e9..688e886cf3b13 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -74,6 +74,22 @@ static bool isAMXCast(Instruction *II) { match(II, m_Intrinsic(m_Value())); } +// Some instructions may return more than one tiles. +// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal +static unsigned getNumDefTiles(IntrinsicInst *II) { + Type *Ty = II->getType(); + if (Ty->isX86_AMXTy()) + return 1; + + unsigned Num = 0; + for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) { + Type *STy = Ty->getContainedType(i); + if (STy->isX86_AMXTy()) + Num++; + } + return Num; +} + static bool isAMXIntrinsic(Value *I) { auto *II = dyn_cast(I); if (!II) @@ -82,7 +98,7 @@ static bool isAMXIntrinsic(Value *I) { return false; // Check if return type or parameter is x86_amx. If it is x86_amx // the intrinsic must be x86 amx intrinsics. - if (II->getType()->isX86_AMXTy()) + if (getNumDefTiles(II) > 0) return true; for (Value *V : II->args()) { if (V->getType()->isX86_AMXTy()) @@ -121,12 +137,96 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { llvm_unreachable("No terminator in the entry block!"); } -static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { +class ShapeCalculator { +private: + TargetMachine *TM = nullptr; + + // In AMX intrinsics we let Shape = {Row, Col}, but the + // RealCol = Col / ElementSize. We may use the RealCol + // as a new Row for other new created AMX intrinsics. 
+ std::map Col2Row, Row2Col; + +public: + ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {} + std::pair getShape(IntrinsicInst *II, unsigned OpNo); + std::pair getShape(PHINode *Phi); + Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); + Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity); +}; + +Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V, + unsigned Granularity) { + if (Col2Row.count(V)) + return Col2Row[V]; + IRBuilder<> Builder(II); + Value *RealRow = nullptr; + if (isa(V)) + RealRow = + Builder.getInt16((cast(V)->getSExtValue()) / Granularity); + else if (isa(V)) { + // When it is not a const value and it is not a function argument, we + // create Row after the definition of V instead of + // before II. For example, II is %118, we try to getshape for %117: + // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x + // i32> %115). + // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 + // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx + // %117). + // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its + // definition is after its user(new tileload for %117). + // So, the best choice is to create %row right after the definition of + // %106. + Builder.SetInsertPoint(cast(V)); + RealRow = Builder.CreateUDiv(V, Builder.getInt16(4)); + cast(RealRow)->moveAfter(cast(V)); + } else { + // When it is not a const value and it is a function argument, we create + // Row at the entry bb. 
+ IRBuilder<> NewBuilder( + getFirstNonAllocaInTheEntryBlock(*II->getFunction())); + RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity)); + } + Col2Row[V] = RealRow; + return RealRow; +} + +Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V, + unsigned Granularity) { + if (Row2Col.count(V)) + return Row2Col[V]; + IRBuilder<> Builder(II); + Value *RealCol = nullptr; + if (isa(V)) + RealCol = + Builder.getInt16((cast(V)->getSExtValue()) * Granularity); + else if (isa(V)) { + Builder.SetInsertPoint(cast(V)); + RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity)); + cast(RealCol)->moveAfter(cast(V)); + } else { + // When it is not a const value and it is a function argument, we create + // Row at the entry bb. + IRBuilder<> NewBuilder( + getFirstNonAllocaInTheEntryBlock(*II->getFunction())); + RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity)); + } + Row2Col[V] = RealCol; + return RealCol; +} + +// TODO: Refine the row and col-in-bytes of tile to row and col of matrix. 
+std::pair ShapeCalculator::getShape(IntrinsicInst *II, + unsigned OpNo) { + (void)TM; IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: llvm_unreachable("Expect amx intrinsics"); + case Intrinsic::x86_t2rpntlvwz0_internal: + case Intrinsic::x86_t2rpntlvwz0t1_internal: + case Intrinsic::x86_t2rpntlvwz1_internal: + case Intrinsic::x86_t2rpntlvwz1t1_internal: case Intrinsic::x86_tileloadd64_internal: case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: { @@ -154,43 +254,24 @@ static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { Col = II->getArgOperand(2); break; case 5: - if (isa(II->getArgOperand(2))) - Row = Builder.getInt16( - (cast(II->getOperand(2))->getSExtValue()) / 4); - else if (isa(II->getArgOperand(2))) { - // When it is not a const value and it is not a function argument, we - // create Row after the definition of II->getOperand(2) instead of - // before II. For example, II is %118, we try to getshape for %117: - // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x - // i32> %115). - // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 - // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx - // %117). - // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its - // definition is after its user(new tileload for %117). - // So, the best choice is to create %row right after the definition of - // %106. - Builder.SetInsertPoint(cast(II->getOperand(2))); - Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4)); - cast(Row)->moveAfter(cast(II->getOperand(2))); - } else { - // When it is not a const value and it is a function argument, we create - // Row at the entry bb. 
- IRBuilder<> NewBuilder( - getFirstNonAllocaInTheEntryBlock(*II->getFunction())); - Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4)); - } + Row = getRowFromCol(II, II->getArgOperand(2), 4); Col = II->getArgOperand(1); break; } break; } + case Intrinsic::x86_ttransposed_internal: { + assert((OpNo == 2) && "Illegal Operand Number."); + Row = getRowFromCol(II, II->getArgOperand(1), 4); + Col = getColFromRow(II, II->getArgOperand(0), 4); + break; + } } return std::make_pair(Row, Col); } -static std::pair getShape(PHINode *Phi) { +std::pair ShapeCalculator::getShape(PHINode *Phi) { Use &U = *(Phi->use_begin()); unsigned OpNo = U.getOperandNo(); User *V = U.getUser(); @@ -223,14 +304,15 @@ static std::pair getShape(PHINode *Phi) { namespace { class X86LowerAMXType { Function &Func; + ShapeCalculator *SC; // In AMX intrinsics we let Shape = {Row, Col}, but the // RealCol = Col / ElementSize. We may use the RealCol // as a new Row for other new created AMX intrinsics. - std::map Col2Row; + std::map Col2Row, Row2Col; public: - X86LowerAMXType(Function &F) : Func(F) {} + X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {} bool visit(); void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); @@ -247,7 +329,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { Use &U = *(Bitcast->use_begin()); unsigned OpNo = U.getOperandNo(); auto *II = cast(U.getUser()); - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); IRBuilder<> Builder(Bitcast); // Use the maximun column as stride. Value *Stride = Builder.getInt64(64); @@ -327,7 +409,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); @@ -467,10 +549,18 @@ static Value *getAllocaPos(BasicBlock *BB) { static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!"); - auto *II = cast(TileDef); + auto *II = dyn_cast(TileDef); + unsigned Idx = 0; + // Extract tile from multiple tiles' def. + if (auto *Extr = dyn_cast(TileDef)) { + assert(Extr->hasIndices() && "Tile extract miss index!"); + Idx = Extr->getIndices()[0]; + II = cast(Extr->getOperand(0)); + } + assert(II && "Not tile intrinsic!"); - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); + Value *Row = II->getOperand(Idx); + Value *Col = II->getOperand(Idx + 1); BasicBlock *BB = TileDef->getParent(); BasicBlock::iterator Iter = TileDef->getIterator(); @@ -489,14 +579,20 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { // Get tile shape. IntrinsicInst *II = nullptr; + unsigned Idx = 0; if (IsPHI) { Value *PhiOp = cast(V)->getIncomingValue(0); II = cast(PhiOp); + } else if (auto *Extr = dyn_cast(V)) { + // Extract tile from multiple tiles' def. 
+ assert(Extr->hasIndices() && "Tile extract miss index!"); + Idx = Extr->getIndices()[0]; + II = cast(Extr->getOperand(0)); } else { II = cast(V); } - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); + Value *Row = II->getOperand(Idx); + Value *Col = II->getOperand(Idx + 1); Instruction *UserI = cast(U.getUser()); IRBuilder<> Builder(UserI); @@ -707,10 +803,12 @@ namespace { class X86LowerAMXCast { Function &Func; + ShapeCalculator *SC; std::unique_ptr DT; public: - X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {} + X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC) + : Func(F), SC(ShapeC), DT(nullptr) {} bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST); bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD); bool combineLdSt(SmallVectorImpl &Casts); @@ -788,7 +886,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( if (!isa(IncValue) && !IncConst->isZeroValue()) return false; Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(OldPN); + std::tie(Row, Col) = SC->getShape(OldPN); // TODO: If it is not constant the Row and Col must domoniate tilezero // that we are going to create. 
if (!Row || !Col || !isa(Row) || !isa(Col)) @@ -919,6 +1017,19 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( return true; } +static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx, + bool IsRow) { + if (!isAMXIntrinsic(Inst)) + return nullptr; + + auto *II = cast(Inst); + if (IsRow) + return II->getOperand(0); + + assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!"); + return II->getOperand(ShapeIdx + 1); +} + // %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42) // store <256 x i32> %43, <256 x i32>* %p, align 64 // --> @@ -926,16 +1037,46 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi( // i64 64, x86_amx %42) bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { Value *Tile = Cast->getOperand(0); - // TODO: If it is cast intrinsic or phi node, we can propagate the - // shape information through def-use chain. - if (!isAMXIntrinsic(Tile)) + + assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!"); + + // TODO: Specially handle the multi-use case. + if (Tile->getNumUses() != 1) return false; - auto *II = cast(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); + + // We don't fetch shape from tilestore, we only get shape from tiledef, + // so we can set the max tile shape to tilestore for special cases. IRBuilder<> Builder(ST); + Value *Row = nullptr; + Value *Col = nullptr; + + if (isAMXIntrinsic(Tile)) { + auto *II = cast(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Row = II->getOperand(0); + Col = II->getOperand(1); + } else { + // Now we supported multi-tiles value in structure, so we may get tile + // from extracting multi-tiles structure. 
+ // For example: + // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1, + // i16 %2, i16 %3, i8* %4, i64 %5) + // %7 = extractvalue { x86_amx, x86_amx } %6, 0 + // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7) + // store <256 x i32> %8, <256 x i32>* %0, align 1024 + // + // TODO: Currently we only handle extractvalue case, enhance me for other + // cases if possible. + auto *II = cast(Tile); + assert(II && "We meet unhandle source in fetching tile value!"); + unsigned ShapeIdx = II->getIndices()[0]; + Value *Tiles = II->getOperand(0); + Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true); + Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false); + } + assert(Row && Col && "Shape got failed!"); + // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy()); @@ -959,7 +1100,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { // shape information through def-use chain. if (!isAMXIntrinsic(II)) return false; - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); IRBuilder<> Builder(LD); // Stride should be equal to col(measured by bytes) Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); @@ -1169,7 +1310,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. 
Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(II, OpNo); + std::tie(Row, Col) = SC->getShape(II, OpNo); std::array Args = { Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())}; Value *NewInst = @@ -1245,13 +1386,14 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass { TargetLibraryInfo *TLI = &getAnalysis().getTLI(F); - X86LowerAMXCast LAC(F); + ShapeCalculator SC(TM); + X86LowerAMXCast LAC(F, &SC); C |= LAC.combineAMXcast(TLI); // There might be remaining AMXcast after combineAMXcast and they should be // handled elegantly. C |= LAC.transformAllAMXCast(); - X86LowerAMXType LAT(F); + X86LowerAMXType LAT(F, &SC); C |= LAT.visit(); // Prepare for fast register allocation at O0. diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 1d1885a3dcd24..d20bfdcdb7f9c 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -118,16 +118,27 @@ class X86PreTileConfig : public MachineFunctionPass { bool isAMXInstruction(MachineInstr &MI) { if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3) return false; - MachineOperand &MO = MI.getOperand(0); + + // PTILESTOREDV is the only exception that doesn't def a AMX register. + if (MI.getOpcode() == X86::PTILESTOREDV) + return true; + // We can simply check if it is AMX instruction by its def. // But we should exclude old API which uses physical registers. - if (MO.isReg() && MO.getReg().isVirtual() && - MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) { - collectShapeInfo(MI); - return true; - } - // PTILESTOREDV is the only exception that doesn't def a AMX register. 
- return MI.getOpcode() == X86::PTILESTOREDV; + MachineOperand &MO = MI.getOperand(0); + if (!MO.isReg() || !MO.getReg().isVirtual()) + return false; + + unsigned Shapes = 0; + if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) + Shapes = 1; + if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID) + Shapes = 2; + if (!Shapes) + return false; + + collectShapeInfo(MI, Shapes); + return true; } /// Check if it is an edge from loop bottom to loop head. @@ -142,7 +153,7 @@ class X86PreTileConfig : public MachineFunctionPass { } /// Collect the shape def information for later use. - void collectShapeInfo(MachineInstr &MI); + void collectShapeInfo(MachineInstr &MI, unsigned Shapes); /// Try to hoist shapes definded below AMX instructions. bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl &Shapes) { @@ -208,7 +219,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Pre-configure", false, false) -void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { +void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) { auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) { MIRef MIR(MI, MBB); auto I = llvm::lower_bound(ShapeBBs[MBB], MIR); @@ -216,8 +227,10 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { ShapeBBs[MBB].insert(I, MIR); }; - SmallVector WorkList( - {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()}); + // All shapes have same row in multi-tile operand. 
+ SmallVector WorkList; + for (unsigned I = 1; I < Shapes + 2; ++I) + WorkList.push_back(MI.getOperand(I).getReg()); while (!WorkList.empty()) { Register R = WorkList.pop_back_val(); MachineInstr *DefMI = MRI->getVRegDef(R); @@ -225,6 +238,14 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) { MachineBasicBlock *DefMBB = DefMI->getParent(); if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second) continue; + + // This happens when column = 0 in multi-tile operand. + if (DefMI->getOpcode() == X86::COPY) { + MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg()); + if (MI && MI->isMoveImmediate()) + continue; + } + if (DefMI->isPHI()) { for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2) if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB())) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 302d50581e1e6..2daaa95b06be0 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -642,6 +642,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } + // Reserve low half pair registers in case they are used by RA aggressively. + Reserved.set(X86::TMM0_TMM1); + Reserved.set(X86::TMM2_TMM3); + assert(checkAllSuperRegsMarked(Reserved, {X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::SIH, X86::DIH, X86::BPH, X86::SPH})); @@ -662,7 +666,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const { // and try to return the minimum number of registers supported by the target. 
static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) && (X86::K6_K7 + 1 == X86::TMMCFG) && - (X86::TMM7 + 1 == X86::R16) && + (X86::TMM6_TMM7 + 1 == X86::R16) && (X86::R31WH + 1 == X86::NUM_TARGET_REGS), "Register number may be incorrect"); @@ -735,7 +739,8 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF, } bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const { - return RC->getID() == X86::TILERegClassID; + return RC->getID() == X86::TILERegClassID || + RC->getID() == X86::TILEPAIRRegClassID; } void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { @@ -1073,12 +1078,59 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: + case X86::PTTRANSPOSEDV: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); VRM->assignVirt2Shape(VirtReg, Shape); return Shape; } + case X86::PT2RPNTLVWZ0V: + case X86::PT2RPNTLVWZ0T1V: + case X86::PT2RPNTLVWZ1V: + case X86::PT2RPNTLVWZ1T1V: { + MachineOperand &MO1 = MI->getOperand(1); + MachineOperand &MO2 = MI->getOperand(2); + MachineOperand &MO3 = MI->getOperand(3); + ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI); + VRM->assignVirt2Shape(VirtReg, Shape); + return Shape; + } + } +} + +static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) { + unsigned PhysShapeNum = PhysShape.getShapeNum(); + unsigned VirtShapeNum = VirtShape.getShapeNum(); + + if (PhysShapeNum < VirtShapeNum) + return false; + + if (PhysShapeNum == VirtShapeNum) { + if (PhysShapeNum == 1) + return PhysShape == VirtShape; + + for (unsigned I = 0; I < PhysShapeNum; I++) { + ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); + ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I)); + if (VShape != PShape) + return false; + } + return true; + } + + // Hint subreg of mult-tile reg to single tile reg. 
+ if (VirtShapeNum == 1) { + for (unsigned I = 0; I < PhysShapeNum; I++) { + ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I)); + if (VirtShape == PShape) + return true; + } + } + + // Note: Currently we have no requirement for case of + // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum) + return false; } bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, @@ -1099,7 +1151,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, if (!VRM) return BaseImplRetVal; - if (ID != X86::TILERegClassID) { + if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) { if (DisableRegAllocNDDHints || !ST.hasNDD() || !TRI.isGeneralPurposeRegisterClass(&RC)) return BaseImplRetVal; @@ -1151,7 +1203,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, return; } ShapeT PhysShape = getTileShape(VReg, const_cast(VRM), MRI); - if (PhysShape == VirtShape) + if (canHintShape(PhysShape, VirtShape)) Hints.push_back(PhysReg); }; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 166024bf3b53f..19a0b37d06a2a 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -30,6 +30,8 @@ let Namespace = "X86" in { def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; + def sub_t0 : SubRegIndex<8192>; + def sub_t1 : SubRegIndex<8192, 8192>; } //===----------------------------------------------------------------------===// @@ -431,6 +433,10 @@ def TMM5: X86Reg<"tmm5", 5>; def TMM6: X86Reg<"tmm6", 6>; def TMM7: X86Reg<"tmm7", 7>; } +// TMM register pairs +def TPAIRS : RegisterTuples<[sub_t0, sub_t1], + [(add TMM0, TMM2, TMM4, TMM6), + (add TMM1, TMM3, TMM5, TMM7)]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. 
That @@ -835,6 +841,9 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} +// Need check alignment 3rd operand size=1024*2*8 +let isAllocatable = 1 in +def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;} //===----------------------------------------------------------------------===// // Register categories. diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 2250c3912a90d..95a84c2cda536 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -76,6 +76,63 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false, false) +unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) { + if (Reg.isVirtual()) { + unsigned RegClassID = MRI->getRegClass(Reg)->getID(); + if (RegClassID == X86::TILERegClassID) + return 1; + if (RegClassID == X86::TILEPAIRRegClassID) + return 2; + } else { + if (Reg >= X86::TMM0 && Reg <= X86::TMM7) + return 1; + if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7) + return 2; + } + return 0; +} + +static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM, + Register VirtReg, + SmallVector &Phys2Shapes) { + unsigned Num = getAMXRegNum(MRI, VirtReg); + MCRegister PhysReg = VRM.getPhys(VirtReg); + if (!PhysReg) + return; + + if (Num == 1) { + unsigned Index = PhysReg - X86::TMM0; + if (!Phys2Shapes[Index].isValid()) { + ShapeT Shape = VRM.getShape(VirtReg); + Phys2Shapes[Index] = Shape; + return; + } + } + // Split tile pair shape info to 2 single tile shape info. e.g: + // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes. 
+ if (Num == 2) { + unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2; + unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1; + + ShapeT Shape = VRM.getShape(VirtReg); + assert(Shape.getShapeNum() == 2 && "Unexpected shape number!"); + + if (!Phys2Shapes[Index0].isValid()) { + ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI); + Phys2Shapes[Index0] = Shape0; + } + + if (!Phys2Shapes[Index1].isValid()) { + ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI); + Phys2Shapes[Index1] = Shape1; + } + } +} + +static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) { + return getAMXRegNum(MRI, Reg) > 0; +} + bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { X86MachineFunctionInfo *X86FI = MF.getInfo(); // Early exit in the common case of non-AMX code. @@ -121,29 +178,24 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { assert(ConstMI && "Cannot find an insertion point"); unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs(); - SmallVector Phys2Virt(AMXRegNum, 0); + SmallVector Phys2Shapes(AMXRegNum, ShapeT()); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { Register VirtReg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(VirtReg)) continue; - if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID) - continue; - MCRegister PhysReg = VRM.getPhys(VirtReg); - if (!PhysReg) + if (!isAMXRegClass(&MRI, VirtReg)) continue; - unsigned Index = PhysReg - X86::TMM0; - if (!Phys2Virt[Index]) - Phys2Virt[Index] = VirtReg; + collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes); } // Fill in the shape of each tile physical register. for (unsigned I = 0; I < AMXRegNum; ++I) { - if (!Phys2Virt[I]) + ShapeT Shape = Phys2Shapes[I]; + if (!Shape.isValid()) continue; DebugLoc DL; bool IsRow = true; MachineInstr *NewMI = nullptr; - ShapeT Shape = VRM.getShape(Phys2Virt[I]); for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) { // Here is the data format for the tile config. 
// 0 palette @@ -172,7 +224,15 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) { "Cannot initialize with different shapes"); continue; } - Imm = DefMI.getOperand(1).getImm(); + if (DefMI.getOperand(1).isImm()) { + Imm = DefMI.getOperand(1).getImm(); + } else { + assert(DefMI.getOpcode() == X86::MOV32r0 && + "The opcode is assumed to be MOV32r0 if the operand is not " + "immediate."); + Imm = 0; + } + NewMI = addFrameReference( BuildMI(MF.front(), ++ConstMI->getIterator(), DL, TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)), diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 5c4e3a9dc52b0..93911bc51a207 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1876,6 +1876,11 @@ const StringMap sys::getHostCPUFeatures() { MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1); + bool HasLeaf1E = MaxLevel >= 0x1e && + !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); + Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; + Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; + bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 586df5748aa82..691809b6d4b5a 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -598,6 +598,8 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; 
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 0395ee62ae988..070df429bfc26 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -60,6 +60,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Coroutines/ABI.h" #include "llvm/Transforms/Coroutines/CoroInstr.h" @@ -118,7 +119,6 @@ class CoroCloner { TargetTransformInfo &TTI; -public: /// Create a cloner for a switch lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, Kind FKind, TargetTransformInfo &TTI) @@ -140,6 +140,30 @@ class CoroCloner { assert(ActiveSuspend && "need active suspend point for continuation"); } +public: + /// Create a clone for a switch lowering. + static Function *createClone(Function &OrigF, const Twine &Suffix, + coro::Shape &Shape, Kind FKind, + TargetTransformInfo &TTI) { + TimeTraceScope FunctionScope("CoroCloner"); + + CoroCloner Cloner(OrigF, Suffix, Shape, FKind, TTI); + Cloner.create(); + return Cloner.getFunction(); + } + + /// Create a clone for a continuation lowering. 
+ static Function *createClone(Function &OrigF, const Twine &Suffix, + coro::Shape &Shape, Function *NewF, + AnyCoroSuspendInst *ActiveSuspend, + TargetTransformInfo &TTI) { + TimeTraceScope FunctionScope("CoroCloner"); + + CoroCloner Cloner(OrigF, Suffix, Shape, NewF, ActiveSuspend, TTI); + Cloner.create(); + return Cloner.getFunction(); + } + Function *getFunction() const { assert(NewF != nullptr && "declaration not yet set"); return NewF; @@ -1466,13 +1490,16 @@ struct SwitchCoroutineSplitter { TargetTransformInfo &TTI) { assert(Shape.ABI == coro::ABI::Switch); + // Create a resume clone by cloning the body of the original function, + // setting new entry block and replacing coro.suspend an appropriate value + // to force resume or cleanup pass for every suspend point. createResumeEntryBlock(F, Shape); - auto *ResumeClone = - createClone(F, ".resume", Shape, CoroCloner::Kind::SwitchResume, TTI); - auto *DestroyClone = - createClone(F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind, TTI); - auto *CleanupClone = - createClone(F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup, TTI); + auto *ResumeClone = CoroCloner::createClone( + F, ".resume", Shape, CoroCloner::Kind::SwitchResume, TTI); + auto *DestroyClone = CoroCloner::createClone( + F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind, TTI); + auto *CleanupClone = CoroCloner::createClone( + F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup, TTI); postSplitCleanup(*ResumeClone); postSplitCleanup(*DestroyClone); @@ -1562,17 +1589,6 @@ struct SwitchCoroutineSplitter { } private: - // Create a resume clone by cloning the body of the original function, setting - // new entry block and replacing coro.suspend an appropriate value to force - // resume or cleanup pass for every suspend point. 
- static Function *createClone(Function &F, const Twine &Suffix, - coro::Shape &Shape, CoroCloner::Kind FKind, - TargetTransformInfo &TTI) { - CoroCloner Cloner(F, Suffix, Shape, FKind, TTI); - Cloner.create(); - return Cloner.getFunction(); - } - // Create an entry block for a resume function with a switch that will jump to // suspend points. static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { @@ -1872,7 +1888,8 @@ void coro::AsyncABI::splitCoroutine(Function &F, coro::Shape &Shape, auto *Suspend = Shape.CoroSuspends[Idx]; auto *Clone = Clones[Idx]; - CoroCloner(F, "resume." + Twine(Idx), Shape, Clone, Suspend, TTI).create(); + CoroCloner::createClone(F, "resume." + Twine(Idx), Shape, Clone, Suspend, + TTI); } } @@ -2001,7 +2018,8 @@ void coro::AnyRetconABI::splitCoroutine(Function &F, coro::Shape &Shape, auto Suspend = Shape.CoroSuspends[i]; auto Clone = Clones[i]; - CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend, TTI).create(); + CoroCloner::createClone(F, "resume." 
+ Twine(i), Shape, Clone, Suspend, + TTI); } } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 3fcfc6a876776..70f83892c3739 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -118,10 +118,16 @@ static cl::opt ClWriteSummary( cl::desc("Write summary to given YAML file after running pass"), cl::Hidden); -static cl::opt +static cl::opt ClDropTypeTests("lowertypetests-drop-type-tests", - cl::desc("Simply drop type test assume sequences"), - cl::Hidden, cl::init(false)); + cl::desc("Simply drop type test sequences"), + cl::values(clEnumValN(DropTestKind::None, "none", + "Do not drop any type tests"), + clEnumValN(DropTestKind::Assume, "assume", + "Drop type test assume sequences"), + clEnumValN(DropTestKind::All, "all", + "Drop all type test sequences")), + cl::Hidden, cl::init(DropTestKind::None)); bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { if (Offset < ByteOffset) @@ -399,7 +405,7 @@ class LowerTypeTestsModule { const ModuleSummaryIndex *ImportSummary; // Set when the client has invoked this to simply drop all type test assume // sequences. - bool DropTypeTests; + DropTestKind DropTypeTests; Triple::ArchType Arch; Triple::OSType OS; @@ -542,7 +548,7 @@ class LowerTypeTestsModule { LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary, - bool DropTypeTests); + DropTestKind DropTypeTests); bool lower(); @@ -1828,9 +1834,10 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( /// Lower all type tests in this module. 
LowerTypeTestsModule::LowerTypeTestsModule( Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, - const ModuleSummaryIndex *ImportSummary, bool DropTypeTests) + const ModuleSummaryIndex *ImportSummary, DropTestKind DropTypeTests) : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary), - DropTypeTests(DropTypeTests || ClDropTypeTests) { + DropTypeTests(ClDropTypeTests > DropTypeTests ? ClDropTypeTests + : DropTypeTests) { assert(!(ExportSummary && ImportSummary)); Triple TargetTriple(M.getTargetTriple()); Arch = TargetTriple.getArch(); @@ -1882,7 +1889,7 @@ bool LowerTypeTestsModule::runForTesting(Module &M, ModuleAnalysisManager &AM) { M, AM, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr, - /*DropTypeTests*/ false) + /*DropTypeTests=*/DropTestKind::None) .lower(); if (!ClWriteSummary.empty()) { @@ -1949,7 +1956,8 @@ void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { Old->replaceUsesWithIf(New, isDirectCall); } -static void dropTypeTests(Module &M, Function &TypeTestFunc) { +static void dropTypeTests(Module &M, Function &TypeTestFunc, + bool ShouldDropAll) { for (Use &U : llvm::make_early_inc_range(TypeTestFunc.uses())) { auto *CI = cast(U.getUser()); // Find and erase llvm.assume intrinsics for this llvm.type.test call. @@ -1959,9 +1967,13 @@ static void dropTypeTests(Module &M, Function &TypeTestFunc) { // If the assume was merged with another assume, we might have a use on a // phi (which will feed the assume). Simply replace the use on the phi // with "true" and leave the merged assume. + // + // If ShouldDropAll is set, then we we need to update any remaining uses, + // regardless of the instruction type. 
if (!CI->use_empty()) { - assert( - all_of(CI->users(), [](User *U) -> bool { return isa(U); })); + assert(ShouldDropAll || all_of(CI->users(), [](User *U) -> bool { + return isa(U); + })); CI->replaceAllUsesWith(ConstantInt::getTrue(M.getContext())); } CI->eraseFromParent(); @@ -1972,16 +1984,17 @@ bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); - if (DropTypeTests) { + if (DropTypeTests != DropTestKind::None) { + bool ShouldDropAll = DropTypeTests == DropTestKind::All; if (TypeTestFunc) - dropTypeTests(M, *TypeTestFunc); + dropTypeTests(M, *TypeTestFunc, ShouldDropAll); // Normally we'd have already removed all @llvm.public.type.test calls, // except for in the case where we originally were performing ThinLTO but // decided not to in the backend. Function *PublicTypeTestFunc = Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (PublicTypeTestFunc) - dropTypeTests(M, *PublicTypeTestFunc); + dropTypeTests(M, *PublicTypeTestFunc, ShouldDropAll); if (TypeTestFunc || PublicTypeTestFunc) { // We have deleted the type intrinsics, so we no longer have enough // information to reason about the liveness of virtual function pointers @@ -2083,8 +2096,12 @@ bool LowerTypeTestsModule::lower() { for (auto &I : *ExportSummary) for (auto &GVS : I.second.SummaryList) if (GVS->isLive()) - for (const auto &Ref : GVS->refs()) + for (const auto &Ref : GVS->refs()) { AddressTaken.insert(Ref.getGUID()); + for (auto &RefGVS : Ref.getSummaryList()) + if (auto Alias = dyn_cast(RefGVS.get())) + AddressTaken.insert(Alias->getAliaseeGUID()); + } NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); if (CfiFunctionsMD) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 7a060cdab2d37..adbd9186c59c5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -102,6 +102,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Instruction *visitSRem(BinaryOperator &I); Instruction *visitFRem(BinaryOperator &I); bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I); + Instruction *commonIDivRemTransforms(BinaryOperator &I); Instruction *commonIRemTransforms(BinaryOperator &I); Instruction *commonIDivTransforms(BinaryOperator &I); Instruction *visitUDiv(BinaryOperator &I); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index b9c165da906da..f85a3c9365135 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1158,29 +1158,39 @@ static Value *foldIDivShl(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { return nullptr; } -/// This function implements the transforms common to both integer division -/// instructions (udiv and sdiv). It is called by the visitors to those integer -/// division instructions. -/// Common integer divide transforms -Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { - if (Instruction *Phi = foldBinopWithPhiOperands(I)) - return Phi; - +/// Common integer divide/remainder transforms +Instruction *InstCombinerImpl::commonIDivRemTransforms(BinaryOperator &I) { + assert(I.isIntDivRem() && "Unexpected instruction"); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - bool IsSigned = I.getOpcode() == Instruction::SDiv; + + // If any element of a constant divisor fixed width vector is zero or undef + // the behavior is undefined and we can fold the whole op to poison. 
+ auto *Op1C = dyn_cast(Op1); Type *Ty = I.getType(); + auto *VTy = dyn_cast(Ty); + if (Op1C && VTy) { + unsigned NumElts = VTy->getNumElements(); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *Elt = Op1C->getAggregateElement(i); + if (Elt && (Elt->isNullValue() || isa(Elt))) + return replaceInstUsesWith(I, PoisonValue::get(Ty)); + } + } + + if (Instruction *Phi = foldBinopWithPhiOperands(I)) + return Phi; // The RHS is known non-zero. if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) return replaceOperand(I, 1, V); - // Handle cases involving: [su]div X, (select Cond, Y, Z) - // This does not apply for fdiv. + // Handle cases involving: div/rem X, (select Cond, Y, Z) if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; // If the divisor is a select-of-constants, try to constant fold all div ops: - // C / (select Cond, TrueC, FalseC) --> select Cond, (C / TrueC), (C / FalseC) + // C div/rem (select Cond, TrueC, FalseC) --> select Cond, (C div/rem TrueC), + // (C div/rem FalseC) // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. if (match(Op0, m_ImmConstant()) && match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { @@ -1189,6 +1199,21 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { return R; } + return nullptr; +} + +/// This function implements the transforms common to both integer division +/// instructions (udiv and sdiv). It is called by the visitors to those integer +/// division instructions. 
+/// Common integer divide transforms +Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { + if (Instruction *Res = commonIDivRemTransforms(I)) + return Res; + + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + bool IsSigned = I.getOpcode() == Instruction::SDiv; + Type *Ty = I.getType(); + const APInt *C2; if (match(Op1, m_APInt(C2))) { Value *X; @@ -2138,29 +2163,11 @@ static Instruction *simplifyIRemMulShl(BinaryOperator &I, /// remainder instructions. /// Common integer remainder transforms Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { - if (Instruction *Phi = foldBinopWithPhiOperands(I)) - return Phi; + if (Instruction *Res = commonIDivRemTransforms(I)) + return Res; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - // The RHS is known non-zero. - if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) - return replaceOperand(I, 1, V); - - // Handle cases involving: rem X, (select Cond, Y, Z) - if (simplifyDivRemOfSelectWithZeroOp(I)) - return &I; - - // If the divisor is a select-of-constants, try to constant fold all rem ops: - // C % (select Cond, TrueC, FalseC) --> select Cond, (C % TrueC), (C % FalseC) - // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. 
- if (match(Op0, m_ImmConstant()) && - match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { - if (Instruction *R = FoldOpIntoSelect(I, cast(Op1), - /*FoldWithMultiUse*/ true)) - return R; - } - if (isa(Op1)) { if (Instruction *Op0I = dyn_cast(Op0)) { if (SelectInst *SI = dyn_cast(Op0I)) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index c5f39a4c381ed..999ad1adff20b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -529,9 +529,6 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, if (!OpToFold) return nullptr; - // TODO: We probably ought to revisit cases where the select and FP - // instructions have different flags and add tests to ensure the - // behaviour is correct. FastMathFlags FMF; if (isa(&SI)) FMF = SI.getFastMathFlags(); @@ -564,6 +561,14 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel); BO->copyIRFlags(TVI); + if (isa(&SI)) { + // Merge poison generating flags from the select. + BO->setHasNoNaNs(BO->hasNoNaNs() && FMF.noNaNs()); + BO->setHasNoInfs(BO->hasNoInfs() && FMF.noInfs()); + // Merge no-signed-zeros flag from the select. + // Otherwise we may produce zeros with different sign. + BO->setHasNoSignedZeros(BO->hasNoSignedZeros() && FMF.noSignedZeros()); + } return BO; }; @@ -1051,7 +1056,7 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, // Strictness of the comparison is irrelevant. X = Cmp0; Y = Cmp1; - if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) { + if (match(FVal, m_c_Add(m_NotForbidPoison(m_Specific(X)), m_Specific(Y)))) { // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y) // (X u< Y) ? 
-1 : (Y + ~X) --> uadd.sat(Y, ~X) BinaryOperator *BO = cast(FVal); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 2ba600497e00d..ad9b1217089d7 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2189,6 +2189,16 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { return Changed; } +// Return true iff V1 can be replaced with V2. +static bool canBeReplacedBy(Value *V1, Value *V2) { + if (auto *CB1 = dyn_cast(V1)) + if (auto *CB2 = dyn_cast(V2)) + return CB1->getAttributes() + .intersectWith(CB2->getContext(), CB2->getAttributes()) + .has_value(); + return true; +} + static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { patchReplacementInstruction(I, Repl); I->replaceAllUsesWith(Repl); @@ -2734,7 +2744,7 @@ bool GVNPass::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. Value *Repl = findLeader(I->getParent(), Num); - if (!Repl) { + if (!Repl || !canBeReplacedBy(I, Repl)) { // Failure, just remember this instance for future use. 
LeaderTable.insert(Num, I, I->getParent()); return false; @@ -3000,7 +3010,7 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) { uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); Value *predV = findLeader(P, TValNo); - if (!predV) { + if (!predV || !canBeReplacedBy(CurInst, predV)) { predMap.push_back(std::make_pair(static_cast(nullptr), P)); PREPred = P; ++NumWithout; diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 2052fc6dadd09..05cf638d3f09d 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2490,7 +2490,7 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, dyn_cast(CurrXPN->getIncomingValueForBlock(LoopHeaderBB)); assert(CurLoop->isLoopInvariant(BaseX) && - "Expected BaseX to be avaliable in the preheader!"); + "Expected BaseX to be available in the preheader!"); if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) { // FIXME: support right-shift? diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 13d9e8f186b47..6800ad51cc0a8 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -3854,6 +3854,16 @@ Value *NewGVN::findPHIOfOpsLeader(const Expression *E, return nullptr; } +// Return true iff V1 can be replaced with V2. +static bool canBeReplacedBy(Value *V1, Value *V2) { + if (auto *CB1 = dyn_cast(V1)) + if (auto *CB2 = dyn_cast(V2)) + return CB1->getAttributes() + .intersectWith(CB2->getContext(), CB2->getAttributes()) + .has_value(); + return true; +} + bool NewGVN::eliminateInstructions(Function &F) { // This is a non-standard eliminator. 
The normal way to eliminate is // to walk the dominator tree in order, keeping track of available @@ -3963,6 +3973,9 @@ bool NewGVN::eliminateInstructions(Function &F) { MembersLeft.insert(Member); continue; } + if (!canBeReplacedBy(Member, Leader)) + continue; + LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member << "\n"); auto *I = cast(Member); @@ -4069,8 +4082,11 @@ bool NewGVN::eliminateInstructions(Function &F) { if (DominatingLeader != Def) { // Even if the instruction is removed, we still need to update // flags/metadata due to downstreams users of the leader. - if (!match(DefI, m_Intrinsic())) + if (!match(DefI, m_Intrinsic())) { + if (!canBeReplacedBy(DefI, DominatingLeader)) + continue; patchReplacementInstruction(DefI, DominatingLeader); + } markInstructionForDeletion(DefI); } @@ -4112,17 +4128,21 @@ bool NewGVN::eliminateInstructions(Function &F) { // Don't replace our existing users with ourselves. if (U->get() == DominatingLeader) continue; - LLVM_DEBUG(dbgs() - << "Found replacement " << *DominatingLeader << " for " - << *U->get() << " in " << *(U->getUser()) << "\n"); // If we replaced something in an instruction, handle the patching of // metadata. Skip this if we are replacing predicateinfo with its // original operand, as we already know we can just drop it. auto *ReplacedInst = cast(U->get()); auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst); - if (!PI || DominatingLeader != PI->OriginalOp) + if (!PI || DominatingLeader != PI->OriginalOp) { + if (!canBeReplacedBy(ReplacedInst, DominatingLeader)) + continue; patchReplacementInstruction(ReplacedInst, DominatingLeader); + } + + LLVM_DEBUG(dbgs() + << "Found replacement " << *DominatingLeader << " for " + << *U->get() << " in " << *(U->getUser()) << "\n"); U->set(DominatingLeader); // This is now a use of the dominating leader, which means if the // dominating leader was dead, it's now live! 
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 92e47cbc7ae8b..90c18c0b9c01c 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" @@ -325,6 +326,10 @@ class StructurizeCFG { void findUndefBlocks(BasicBlock *PHIBlock, const SmallSet &Incomings, SmallVector &UndefBlks) const; + + void mergeIfCompatible(EquivalenceClasses &PhiClasses, PHINode *A, + PHINode *B); + void setPhiValues(); void simplifyAffectedPhis(); @@ -755,10 +760,102 @@ void StructurizeCFG::findUndefBlocks( } } +// If two phi nodes have compatible incoming values (for each +// incoming block, either they have the same incoming value or only one phi +// node has an incoming value), let them share the merged incoming values. The +// merge process is guided by the equivalence information from \p PhiClasses. +// The function will possibly update the incoming values of leader phi in +// DeletedPhis. +void StructurizeCFG::mergeIfCompatible( + EquivalenceClasses &PhiClasses, PHINode *A, PHINode *B) { + auto ItA = PhiClasses.findLeader(PhiClasses.insert(A)); + auto ItB = PhiClasses.findLeader(PhiClasses.insert(B)); + // They are already in the same class, no work needed. 
+ if (ItA == ItB) + return; + + PHINode *LeaderA = *ItA; + PHINode *LeaderB = *ItB; + BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA]; + BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB]; + + DenseMap Mergeable(IncomingA.begin(), IncomingA.end()); + for (auto [BB, V] : IncomingB) { + auto BBIt = Mergeable.find(BB); + if (BBIt != Mergeable.end() && BBIt->second != V) + return; + // Either IncomingA does not have this value or IncomingA has the same + // value. + Mergeable.insert({BB, V}); + } + + // Update the incoming value of leaderA. + IncomingA.assign(Mergeable.begin(), Mergeable.end()); + PhiClasses.unionSets(ItA, ItB); +} + /// Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SmallVector InsertedPhis; SSAUpdater Updater(&InsertedPhis); + DenseMap> UndefBlksMap; + + // Find phi nodes that have compatible incoming values (either they have + // the same value for the same block or only one phi node has an incoming + // value, see example below). We only search again the phi's that are + // referenced by another phi, which is the case we care about. + // + // For example (-- means no incoming value): + // phi1 : BB1:phi2 BB2:v BB3:-- + // phi2: BB1:-- BB2:v BB3:w + // + // Then we can merge these incoming values and let phi1, phi2 use the + // same set of incoming values: + // + // phi1&phi2: BB1:phi2 BB2:v BB3:w + // + // By doing this, phi1 and phi2 would share more intermediate phi nodes. + // This would help reduce the number of phi nodes during SSA reconstruction + // and ultimately result in fewer COPY instructions. + // + // This should be correct, because if a phi node does not have incoming + // value from certain block, this means the block is not the predecessor + // of the parent block, so we actually don't care about its incoming value. 
+ EquivalenceClasses PhiClasses; + for (const auto &[To, From] : AddedPhis) { + auto OldPhiIt = DeletedPhis.find(To); + if (OldPhiIt == DeletedPhis.end()) + continue; + + PhiMap &BlkPhis = OldPhiIt->second; + SmallVector &UndefBlks = UndefBlksMap[To]; + SmallSet Incomings; + + // Get the undefined blocks shared by all the phi nodes. + if (!BlkPhis.empty()) { + for (const auto &VI : BlkPhis.front().second) + Incomings.insert(VI.first); + findUndefBlocks(To, Incomings, UndefBlks); + } + + for (const auto &[Phi, Incomings] : OldPhiIt->second) { + SmallVector IncomingPHIs; + for (const auto &[BB, V] : Incomings) { + // First, for each phi, check whether it has incoming value which is + // another phi. + if (PHINode *P = dyn_cast(V)) + IncomingPHIs.push_back(P); + } + + for (auto *OtherPhi : IncomingPHIs) { + // Skip phis that are unrelated to the phi reconstruction for now. + if (!DeletedPhis.contains(OtherPhi->getParent())) + continue; + mergeIfCompatible(PhiClasses, Phi, OtherPhi); + } + } + } + for (const auto &AddedPhi : AddedPhis) { BasicBlock *To = AddedPhi.first; const BBVector &From = AddedPhi.second; @@ -766,28 +863,27 @@ void StructurizeCFG::setPhiValues() { if (!DeletedPhis.count(To)) continue; - SmallVector UndefBlks; - bool CachedUndefs = false; PhiMap &Map = DeletedPhis[To]; - for (const auto &PI : Map) { - PHINode *Phi = PI.first; + SmallVector &UndefBlks = UndefBlksMap[To]; + for (const auto &[Phi, Incoming] : Map) { Value *Undef = UndefValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); - SmallSet Incomings; - SmallVector ConstantPreds; - for (const auto &VI : PI.second) { - Incomings.insert(VI.first); - Updater.AddAvailableValue(VI.first, VI.second); - if (isa(VI.second)) - ConstantPreds.push_back(VI.first); - } + // Use leader phi's incoming if there is. 
+ auto LeaderIt = PhiClasses.findLeader(Phi); + bool UseIncomingOfLeader = + LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi; + const auto &IncomingMap = + UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt] + : Incoming; - if (!CachedUndefs) { - findUndefBlocks(To, Incomings, UndefBlks); - CachedUndefs = true; + SmallVector ConstantPreds; + for (const auto &[BB, V] : IncomingMap) { + Updater.AddAvailableValue(BB, V); + if (isa(V)) + ConstantPreds.push_back(BB); } for (auto UB : UndefBlks) { @@ -798,6 +894,10 @@ void StructurizeCFG::setPhiValues() { if (any_of(ConstantPreds, [&](BasicBlock *CP) { return DT->dominates(CP, UB); })) continue; + // Maybe already get a value through sharing with other phi nodes. + if (Updater.HasValueForBlock(UB)) + continue; + Updater.AddAvailableValue(UB, Undef); } @@ -805,10 +905,7 @@ void StructurizeCFG::setPhiValues() { Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); AffectedPhis.push_back(Phi); } - - DeletedPhis.erase(To); } - assert(DeletedPhis.empty()); AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index e1dd20478fd55..1d4f5618b39d0 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -412,7 +412,7 @@ struct AssumeSimplify { CleanupToDo.insert(Assume); if (BOI.Begin != BOI.End) { Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn]; - U->set(UndefValue::get(U->get()->getType())); + U->set(PoisonValue::get(U->get()->getType())); } BOI.Tag = IgnoreTag; }; diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 5fd4fd78c28a9..e039457f313b2 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1215,6 +1215,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case 
LibFunc_fmod: case LibFunc_fmodf: case LibFunc_fmodl: + case LibFunc_hypot: + case LibFunc_hypotf: + case LibFunc_hypotl: case LibFunc_isascii: case LibFunc_isdigit: case LibFunc_labs: diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 5dc82a8dfb2db..a2d38717f38d1 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -87,28 +87,14 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, return NewBB; } -// Clone OldFunc into NewFunc, transforming the old arguments into references to -// VMap values. -// -void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, - CloneFunctionChangeType Changes, - SmallVectorImpl &Returns, - const char *NameSuffix, ClonedCodeInfo *CodeInfo, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - NewFunc->setIsNewDbgInfoFormat(OldFunc->IsNewDbgInfoFormat); - assert(NameSuffix && "NameSuffix cannot be null!"); - -#ifndef NDEBUG - for (const Argument &I : OldFunc->args()) - assert(VMap.count(&I) && "No mapping from source argument specified!"); -#endif - - bool ModuleLevelChanges = Changes > CloneFunctionChangeType::LocalChangesOnly; - - // Copy all attributes other than those stored in the AttributeList. We need - // to remap the parameter indices of the AttributeList. +void llvm::CloneFunctionAttributesInto(Function *NewFunc, + const Function *OldFunc, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + // Copy all attributes other than those stored in Function's AttributeList + // which holds e.g. parameters and return value attributes. 
AttributeList NewAttrs = NewFunc->getAttributes(); NewFunc->copyAttributesFrom(OldFunc); NewFunc->setAttributes(NewAttrs); @@ -140,6 +126,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Clone any argument attributes that are present in the VMap. for (const Argument &OldArg : OldFunc->args()) { if (Argument *NewArg = dyn_cast(VMap[&OldArg])) { + // Remap the parameter indices. NewArgAttrs[NewArg->getArgNo()] = OldAttrs.getParamAttrs(OldArg.getArgNo()); } @@ -148,6 +135,29 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, NewFunc->setAttributes( AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttrs(), OldAttrs.getRetAttrs(), NewArgAttrs)); +} + +// Clone OldFunc into NewFunc, transforming the old arguments into references to +// VMap values. +void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap, + CloneFunctionChangeType Changes, + SmallVectorImpl &Returns, + const char *NameSuffix, ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + NewFunc->setIsNewDbgInfoFormat(OldFunc->IsNewDbgInfoFormat); + assert(NameSuffix && "NameSuffix cannot be null!"); + +#ifndef NDEBUG + for (const Argument &I : OldFunc->args()) + assert(VMap.count(&I) && "No mapping from source argument specified!"); +#endif + + bool ModuleLevelChanges = Changes > CloneFunctionChangeType::LocalChangesOnly; + + CloneFunctionAttributesInto(NewFunc, OldFunc, VMap, ModuleLevelChanges, + TypeMapper, Materializer); // Everything else beyond this point deals with function instructions, // so if we are dealing with a function declaration, we're done. 
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 4ad426285ce2f..a27cb4dd219c3 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -181,9 +181,21 @@ namespace { } } }; - } // end anonymous namespace +static IntrinsicInst *getConvergenceEntry(BasicBlock &BB) { + auto *I = BB.getFirstNonPHI(); + while (I) { + if (auto *IntrinsicCall = dyn_cast(I)) { + if (IntrinsicCall->isEntry()) { + return IntrinsicCall; + } + } + I = I->getNextNode(); + } + return nullptr; +} + /// Get or create a target for the branch from ResumeInsts. BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; @@ -2496,15 +2508,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // fully implements convergence control tokens, there is no mixing of // controlled and uncontrolled convergent operations in the whole program. if (CB.isConvergent()) { - auto *I = CalledFunc->getEntryBlock().getFirstNonPHI(); - if (auto *IntrinsicCall = dyn_cast(I)) { - if (IntrinsicCall->getIntrinsicID() == - Intrinsic::experimental_convergence_entry) { - if (!ConvergenceControlToken) { - return InlineResult::failure( - "convergent call needs convergencectrl operand"); - } - } + if (!ConvergenceControlToken && + getConvergenceEntry(CalledFunc->getEntryBlock())) { + return InlineResult::failure( + "convergent call needs convergencectrl operand"); } } @@ -2795,13 +2802,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, } if (ConvergenceControlToken) { - auto *I = FirstNewBlock->getFirstNonPHI(); - if (auto *IntrinsicCall = dyn_cast(I)) { - if (IntrinsicCall->getIntrinsicID() == - Intrinsic::experimental_convergence_entry) { - IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken); - IntrinsicCall->eraseFromParent(); - } + IntrinsicInst *IntrinsicCall = getConvergenceEntry(*FirstNewBlock); 
+ if (IntrinsicCall) { + IntrinsicCall->replaceAllUsesWith(ConvergenceControlToken); + IntrinsicCall->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 65c1669f92b4d..47a7049255961 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3508,6 +3508,17 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { else if (!isa(I)) ReplInst->andIRFlags(I); + // Handle attributes. + if (auto *CB1 = dyn_cast(ReplInst)) { + if (auto *CB2 = dyn_cast(I)) { + bool Success = CB1->tryIntersectAttributes(CB2); + assert(Success && "We should not be trying to sink callbases " + "with non-intersectable attributes"); + // For NDEBUG Compile. + (void)Success; + } + } + // FIXME: If both the original and replacement value are part of the // same control-flow region (meaning that the execution of one // guarantees the execution of the other), then we can combine the diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index c65710ea7551a..4225e7e80fda6 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -1923,6 +1923,12 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) { return (void)mergeInValue(IV, &CB, CopyOfVal); } + if (II->getIntrinsicID() == Intrinsic::vscale) { + unsigned BitWidth = CB.getType()->getScalarSizeInBits(); + const ConstantRange Result = getVScaleRange(II->getFunction(), BitWidth); + return (void)mergeInValue(II, ValueLatticeElement::getRange(Result)); + } + if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) { // Compute result range for intrinsics supported by ConstantRange. 
// Do this even if we don't know a range for all operands, as we may diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index b2745c81dec88..7787f58683b2a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -506,7 +506,7 @@ class LoopVectorizationPlanner { // instructions leading from the loop exit instr to the phi need to be // converted to reductions, with one operand being vector and the other being // the scalar reduction chain. For other reductions, a select is introduced - // between the phi and live-out recipes when folding the tail. + // between the phi and users outside the vector region when folding the tail. void adjustRecipesForReductions(VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 150fc4a42b484..659b4c30a58ad 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -467,11 +467,12 @@ class InnerLoopVectorizer { ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, + VPlan &Plan) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI), RTChecks(RTChecks) { + PSI(PSI), RTChecks(RTChecks), Plan(Plan) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. 
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -497,8 +498,8 @@ class InnerLoopVectorizer { virtual std::pair createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); - /// Fix the vectorized code, taking care of header phi's, live-outs, and more. - void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); + /// Fix the vectorized code, taking care of header phi's, and more. + void fixVectorizedLoop(VPTransformState &State); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -513,7 +514,7 @@ class InnerLoopVectorizer { VPTransformState &State); /// Fix the non-induction PHIs in \p Plan. - void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); + void fixNonInductionPHIs(VPTransformState &State); /// Create a new phi node for the induction variable \p OrigPhi to resume /// iteration count in the scalar epilogue, from where the vectorized loop @@ -541,8 +542,7 @@ class InnerLoopVectorizer { /// Set up the values of the IVs correctly when exiting the vector loop. virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, - BasicBlock *MiddleBlock, VPlan &Plan, - VPTransformState &State); + BasicBlock *MiddleBlock, VPTransformState &State); /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. @@ -674,6 +674,8 @@ class InnerLoopVectorizer { /// Structure to hold information about generated runtime checks, responsible /// for cleaning the checks, if vectorization turns out unprofitable. GeneratedRTChecks &RTChecks; + + VPlan &Plan; }; /// Encapsulate information regarding vectorization of a loop and its epilogue. 
@@ -715,10 +717,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &Checks) + GeneratedRTChecks &Checks, VPlan &Plan) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL, - CM, BFI, PSI, Checks), + CM, BFI, PSI, Checks, Plan), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -755,9 +757,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &Check) + GeneratedRTChecks &Check, VPlan &Plan) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI, Check) {} + EPI, LVL, CM, BFI, PSI, Check, Plan) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). 
std::pair @@ -773,7 +775,7 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, - BasicBlock *MiddleBlock, VPlan &Plan, + BasicBlock *MiddleBlock, VPTransformState &State) override {}; }; @@ -789,9 +791,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - GeneratedRTChecks &Checks) + GeneratedRTChecks &Checks, VPlan &Plan) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI, Checks) { + EPI, LVL, CM, BFI, PSI, Checks, Plan) { TripCount = EPI.TripCount; } /// Implements the interface for creating a vectorized skeleton using the @@ -2711,7 +2713,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( | | (opt) v <-- edge from middle to exit iff epilogue is not required. | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). + | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header + | | wrapped in VPIRBasicBlock). \ | \ v >[ ] <-- exit block(s). 
(wrapped in VPIRBasicBlock) @@ -2751,7 +2754,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, - BasicBlock *MiddleBlock, VPlan &Plan, + BasicBlock *MiddleBlock, VPTransformState &State) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate @@ -2931,11 +2934,10 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, TargetTransformInfo::TCK_RecipThroughput); } -void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, - VPlan &Plan) { +void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. if (EnableVPlanNativePath) - fixNonInductionPHIs(Plan, State); + fixNonInductionPHIs(State); // Forget the original basic block. PSE.getSE()->forgetLoop(OrigLoop); @@ -2955,7 +2957,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. } else { - // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking + // TODO: Check in VPlan to see if IV users need fixing instead of checking // the cost model. // If we inserted an edge from the middle block to the unique exit block, @@ -2966,13 +2968,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (const auto &Entry : Legal->getInductionVars()) fixupIVUsers(Entry.first, Entry.second, getOrCreateVectorTripCount(nullptr), - IVEndValues[Entry.first], LoopMiddleBlock, Plan, State); + IVEndValues[Entry.first], LoopMiddleBlock, State); } - // Fix live-out phis not already fixed earlier. 
- for (const auto &KV : Plan.getLiveOuts()) - KV.second->fixPhi(Plan, State); - for (Instruction *PI : PredicatedInstructions) sinkScalarOperands(&*PI); @@ -3077,8 +3075,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } -void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, - VPTransformState &State) { +void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &P : VPBB->phis()) { @@ -7706,8 +7703,7 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + auto *ExitVPBB = BestVPlan.getMiddleBlock(); if (VectorizingEpilogue) for (VPRecipeBase &R : *ExitVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( @@ -7744,7 +7740,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. - ILV.fixVectorizedLoop(State, BestVPlan); + ILV.fixVectorizedLoop(State); ILV.printDebugTracesAtEnd(); @@ -8790,6 +8786,41 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } +/// Create resume phis in the scalar preheader for first-order recurrences and +/// reductions and update the VPIRInstructions wrapping the original phis in the +/// scalar header. 
+static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { + auto *ScalarPH = Plan.getScalarPreheader(); + auto *MiddleVPBB = cast(ScalarPH->getSinglePredecessor()); + VPBuilder ScalarPHBuilder(ScalarPH); + VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + VPValue *OneVPV = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) { + auto *ScalarPhiIRI = cast(&ScalarPhiR); + auto *ScalarPhiI = dyn_cast(&ScalarPhiIRI->getInstruction()); + if (!ScalarPhiI) + break; + auto *VectorPhiR = cast(Builder.getRecipe(ScalarPhiI)); + if (!isa(VectorPhiR)) + continue; + // The backedge value provides the value to resume coming out of a loop, + // which for FORs is a vector whose last element needs to be extracted. The + // start value provides the value if the loop is bypassed. + bool IsFOR = isa(VectorPhiR); + auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); + if (IsFOR) + ResumeFromVectorLoop = MiddleBuilder.createNaryOp( + VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {}, + "vector.recur.extract"); + StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx"; + auto *ResumePhiR = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, + {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name); + ScalarPhiIRI->addOperand(ResumePhiR); + } +} + // Collect VPIRInstructions for phis in the original exit block that are modeled // in VPlan and add the exiting VPValue as operand. Some exiting values are not // modeled explicitly yet and won't be included. 
Those are un-truncated @@ -8798,8 +8829,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { - auto *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + auto *MiddleVPBB = Plan.getMiddleBlock(); // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. @@ -8819,8 +8849,7 @@ static SetVector collectUsersInExitBlock( VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); // Exit values for inductions are computed and updated outside of VPlan and // independent of induction recipes. - // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update - // live-outs. + // TODO: Compute induction exit values in VPlan. if ((isa(V) && !cast(V)->getTruncInst()) || isa(V) || @@ -8845,15 +8874,15 @@ addUsersInExitBlock(VPlan &Plan, if (ExitUsersToFix.empty()) return; - auto *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); // Introduce extract for exiting values and update the VPIRInstructions // modeling the corresponding LCSSA phis. for (VPIRInstruction *ExitIRI : ExitUsersToFix) { VPValue *V = ExitIRI->getOperand(0); - // Pass live-in values used by exit phis directly through to the live-out. + // Pass live-in values used by exit phis directly through to their users in + // the exit block. if (V->isLiveIn()) continue; @@ -8865,39 +8894,17 @@ addUsersInExitBlock(VPlan &Plan, } } -/// Handle live-outs for first order reductions, both in the scalar preheader -/// and the original exit block: -/// 1. 
Feed a resume value for every FOR from the vector loop to the scalar -/// loop, if middle block branches to scalar preheader, by introducing -/// ExtractFromEnd and ResumePhi recipes in each, respectively, and a -/// VPLiveOut which uses the latter and corresponds to the scalar header. -/// 2. Feed the penultimate value of recurrences to their LCSSA phi users in -/// the original exit block using a VPLiveOut. -static void addLiveOutsForFirstOrderRecurrences( +/// Handle users in the exit block for first order reductions in the original +/// exit block. The penultimate value of recurrences is fed to their LCSSA phi +/// users in the original exit block using the VPIRInstruction wrapping to the +/// LCSSA phi. +static void addExitUsersForFirstOrderRecurrences( VPlan &Plan, SetVector &ExitUsersToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); - - // Start by finding out if middle block branches to scalar preheader, which is - // not a VPIRBasicBlock, unlike Exit block - the other possible successor of - // middle block. - // TODO: Should be replaced by - // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the - // scalar region is modeled as well. - auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); - VPBasicBlock *ScalarPHVPBB = nullptr; - if (MiddleVPBB->getNumSuccessors() == 2) { - // Order is strict: first is the exit block, second is the scalar preheader. 
- ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); - } else if (ExitUsersToFix.empty()) { - ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); - } else { - llvm_unreachable("unsupported CFG in VPlan"); - } - + auto *ScalarPHVPBB = Plan.getScalarPreheader(); + auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder ScalarPHBuilder(ScalarPHVPBB); VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - VPValue *OneVPV = Plan.getOrAddLiveIn( - ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); VPValue *TwoVPV = Plan.getOrAddLiveIn( ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); @@ -8973,26 +8980,16 @@ static void addLiveOutsForFirstOrderRecurrences( // lo = lcssa.phi [s1, scalar.body], // [vector.recur.extract.for.phi, middle.block] // - // Extract the resume value and create a new VPLiveOut for it. - auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), OneVPV}, - {}, "vector.recur.extract"); - auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( - VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {}, - "scalar.recur.init"); - auto *FORPhi = cast(FOR->getUnderlyingInstr()); - Plan.addLiveOut(FORPhi, ResumePhiRecipe); - // Now update VPIRInstructions modeling LCSSA phis in the exit block. // Extract the penultimate value of the recurrence and use it as operand for // the VPIRInstruction modeling the phi. 
for (VPIRInstruction *ExitIRI : ExitUsersToFix) { if (ExitIRI->getOperand(0) != FOR) continue; - VPValue *Ext = MiddleBuilder.createNaryOp( + VPValue *PenultimateElement = MiddleBuilder.createNaryOp( VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); - ExitIRI->setOperand(0, Ext); + ExitIRI->setOperand(0, PenultimateElement); ExitUsersToFix.remove(ExitIRI); } } @@ -9085,8 +9082,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); return Legal->blockNeedsPredication(BB) || NeedsBlends; }); - auto *MiddleVPBB = - cast(Plan->getVectorLoopRegion()->getSingleSuccessor()); + auto *MiddleVPBB = Plan->getMiddleBlock(); VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe @@ -9166,11 +9162,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + addScalarResumePhis(RecipeBuilder, *Plan); SetVector ExitUsersToFix = collectUsersInExitBlock( OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); + addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); addUsersInExitBlock(*Plan, ExitUsersToFix); - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. @@ -9192,9 +9188,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. 
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { - auto *R = dyn_cast(&U); - if (!R) - return false; + auto *R = cast(&U); return R->getParent()->getParent() || R->getParent() == Plan->getVectorLoopRegion()->getSinglePredecessor(); @@ -9291,7 +9285,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // instructions leading from the loop exit instr to the phi need to be converted // to reductions, with one operand being vector and the other being the scalar // reduction chain. For other reductions, a select is introduced between the phi -// and live-out recipes when folding the tail. +// and users outside the vector region when folding the tail. // // A ComputeReductionResult recipe is added to the middle block, also for // in-loop reductions which compute their result in-loop, because generating @@ -9305,8 +9299,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); - VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); + VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9325,8 +9318,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert(UserRecipe->getParent() == MiddleVPBB && - "U must be either in the loop region or the middle block."); + assert((UserRecipe->getParent() == MiddleVPBB || + UserRecipe->getParent() == Plan->getScalarPreheader()) && + "U must be either in the loop region, the middle block or the " + "scalar preheader."); continue; } Worklist.insert(UserRecipe); @@ -9440,8 +9435,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( const 
RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); // If tail is folded by masking, introduce selects between the phi - // and the live-out instruction of each reduction, at the beginning of the - // dedicated latch block. + // and the users outside the vector region of each reduction, at the + // beginning of the dedicated latch block. auto *OrigExitingVPV = PhiR->getBackedgeValue(); auto *NewExitingVPV = PhiR->getBackedgeValue(); if (!PhiR->isInLoop() && CM.foldTailByMasking()) { @@ -9513,17 +9508,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); - // Order is strict: if there are multiple successors, the first is the exit - // block, second is the scalar preheader. - VPBasicBlock *ScalarPHVPBB = - cast(MiddleVPBB->getSuccessors().back()); - VPBuilder ScalarPHBuilder(ScalarPHVPBB); - auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( - VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()}, - {}, "bc.merge.rdx"); - auto *RedPhi = cast(PhiR->getUnderlyingInstr()); - Plan->addLiveOut(RedPhi, ResumePhiRecipe); - // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in // any iteration. 
The final value is selected by the final @@ -9727,7 +9711,7 @@ static bool processLoopInVPlanNativePath( GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -10215,11 +10199,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. + VPlan &BestPlan = LVP.getPlanFor(VF.Width); InnerLoopVectorizer Unroller( L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), - ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks); + ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan); - VPlan &BestPlan = LVP.getPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { @@ -10236,15 +10220,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizationFactor EpilogueVF = LVP.selectEpilogueVectorizationFactor(VF.Width, IC); if (EpilogueVF.Width.isVector()) { + std::unique_ptr BestMainPlan(BestPlan.duplicate()); // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, &LVL, &CM, BFI, PSI, Checks); + EPI, &LVL, &CM, BFI, PSI, Checks, + *BestMainPlan); - std::unique_ptr BestMainPlan(BestPlan.duplicate()); auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, false); ++LoopsVectorized; @@ -10253,11 +10238,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; + VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF); EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, - Checks); + Checks, BestEpiPlan); - VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -10340,7 +10325,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); + PSI, Checks, BestPlan); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 268546fe99e13..248a107ded514 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8160,9 +8160,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *VL0 = cast(S.OpValue); BB = VL0->getParent(); - if (S.MainOp && !DT->isReachableFromEntry(BB)) { + if (S.MainOp && + (BB->isEHPad() || isa_and_nonnull(BB->getTerminator()) || + !DT->isReachableFromEntry(BB))) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
+ // Do not vectorize EH and non-returning blocks, not profitable in most + // cases. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -11977,11 +11981,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (EphValues.count(EU.User)) continue; - // Used in unreachable blocks or in landing pads (rarely executed). + // Used in unreachable blocks or in EH pads (rarely executed) or is + // terminated with unreachable instruction. if (BasicBlock *UserParent = EU.User ? cast(EU.User)->getParent() : nullptr; UserParent && - (!DT->isReachableFromEntry(UserParent) || UserParent->isLandingPad())) + (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() || + isa_and_present(UserParent->getTerminator()))) continue; // We only add extract cost once for the same scalar. @@ -13808,13 +13814,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { unsigned VF = 0, function_ref &)> Action = {}) { IsFinalized = true; + unsigned ScalarTyNumElements = getNumElements(ScalarTy); SmallVector NewExtMask(ExtMask); - if (auto *VecTy = dyn_cast(ScalarTy)) { + if (ScalarTyNumElements != 1) { assert(SLPReVec && "FixedVectorType is not expected."); - transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), - CommonMask); - transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), - NewExtMask); + transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask); + transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask); ExtMask = NewExtMask; } if (Action) { @@ -13857,12 +13862,14 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { return !isKnownNonNegative( V, SimplifyQuery(*R.DL)); })); + unsigned InsertionIndex = Idx * ScalarTyNumElements; Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V, - Builder.getInt64(Idx)); + Builder.getInt64(InsertionIndex)); if (!CommonMask.empty()) { - 
std::iota(std::next(CommonMask.begin(), Idx), - std::next(CommonMask.begin(), Idx + E->getVectorFactor()), - Idx); + std::iota(std::next(CommonMask.begin(), InsertionIndex), + std::next(CommonMask.begin(), (Idx + E->getVectorFactor()) * + ScalarTyNumElements), + InsertionIndex); } } InVectors.front() = Vec; @@ -16127,11 +16134,13 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, if (IE->Idx != 0 && !(VectorizableTree.front()->isGather() && !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - })) && + (ValueToGatherNodes.lookup(I).contains( + VectorizableTree.front().get()) || + any_of(IE->UserTreeIndices, + [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + }))) && !(GatheredLoadsEntriesFirst.has_value() && IE->Idx >= *GatheredLoadsEntriesFirst && VectorizableTree.front()->isGather() && @@ -17728,6 +17737,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Scan the blocks in the function in post order. for (auto *BB : post_order(&F.getEntryBlock())) { + if (BB->isEHPad() || isa_and_nonnull(BB->getTerminator())) + continue; + // Start new block - clear the list of reduction roots. 
R.clearReductionData(); collectSeedInstructions(BB); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 0484543d2d039..7c06fb2353822 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -455,11 +455,16 @@ void VPIRBasicBlock::execute(VPTransformState *State) { "VPIRBasicBlock can have at most two successors at the moment!"); State->Builder.SetInsertPoint(IRBB->getTerminator()); executeRecipes(State, IRBB); - if (getSingleSuccessor()) { - assert(isa(IRBB->getTerminator())); + // Create a branch instruction to terminate IRBB if one was not created yet + // and is needed. + if (getSingleSuccessor() && isa(IRBB->getTerminator())) { auto *Br = State->Builder.CreateBr(IRBB); Br->setOperand(0, nullptr); IRBB->getTerminator()->eraseFromParent(); + } else { + assert( + (getNumSuccessors() == 0 || isa(IRBB->getTerminator())) && + "other blocks must be terminated by a branch"); } for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { @@ -474,7 +479,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) { // backedges. A backward successor is set when the branch is created. const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); unsigned idx = PredVPSuccessors.front() == this ? 
0 : 1; - assert(!TermBr->getSuccessor(idx) && + assert((!TermBr->getSuccessor(idx) || TermBr->getSuccessor(idx) == IRBB) && "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, IRBB); State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}}); @@ -843,10 +848,6 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, #endif VPlan::~VPlan() { - for (auto &KV : LiveOuts) - delete KV.second; - LiveOuts.clear(); - if (Entry) { VPValue DummyValue; for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) @@ -878,7 +879,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPIRBasicBlock *Entry = VPIRBasicBlock::fromBasicBlock(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Entry, VecPreheader); + VPIRBasicBlock *ScalarHeader = + VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader()); + auto Plan = std::make_unique(Entry, VecPreheader, ScalarHeader); // Create SCEV and VPValue for the trip count. @@ -909,6 +912,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader); if (!RequiresScalarEpilogueCheck) { VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); return Plan; @@ -1030,21 +1034,9 @@ void VPlan::execute(VPTransformState *State) { // skeleton creation, so we can only create the VPIRBasicBlocks now during // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; - VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); - // Find the VPBB for the scalar preheader, relying on the current structure - // when creating the middle block and its successrs: if there's a single - // predecessor, it must be the scalar preheader. Otherwise, the second - // successor is the scalar preheader. 
+ VPBasicBlock *MiddleVPBB = getMiddleBlock(); BasicBlock *ScalarPh = MiddleBB->getSingleSuccessor(); - auto &MiddleSuccs = MiddleVPBB->getSuccessors(); - assert((MiddleSuccs.size() == 1 || MiddleSuccs.size() == 2) && - "middle block has unexpected successors"); - VPBasicBlock *ScalarPhVPBB = cast( - MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]); - assert(!isa(ScalarPhVPBB) && - "scalar preheader cannot be wrapped already"); - replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh); + replaceVPBBWithIRVPBB(getScalarPreheader(), ScalarPh); replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB); // Disconnect the middle block from its single successor (the scalar loop @@ -1054,6 +1046,11 @@ void VPlan::execute(VPTransformState *State) { BrInst->insertBefore(MiddleBB->getTerminator()); MiddleBB->getTerminator()->eraseFromParent(); State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}}); + // Disconnect scalar preheader and scalar header, as the dominator tree edge + // will be updated as part of VPlan execution. This allows keeping the DTU + // logic generic during VPlan execution. + State->CFG.DTU.applyUpdates( + {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}}); // Generate code in the loop pre-header and body. for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) @@ -1172,12 +1169,6 @@ void VPlan::print(raw_ostream &O) const { Block->print(O, "", SlotTracker); } - if (!LiveOuts.empty()) - O << "\n"; - for (const auto &KV : LiveOuts) { - KV.second->print(O, SlotTracker); - } - O << "}\n"; } @@ -1214,11 +1205,6 @@ LLVM_DUMP_METHOD void VPlan::dump() const { print(dbgs()); } #endif -void VPlan::addLiveOut(PHINode *PN, VPValue *V) { - assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists"); - LiveOuts.insert({PN, new VPLiveOut(PN, V)}); -} - static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry, DenseMap &Old2NewVPValues) { // Update the operands of all cloned recipes starting at NewEntry. 
This @@ -1262,8 +1248,15 @@ VPlan *VPlan::duplicate() { VPBasicBlock *NewPreheader = Preheader->clone(); const auto &[NewEntry, __] = cloneFrom(Entry); + BasicBlock *ScalarHeaderIRBB = getScalarHeader()->getIRBasicBlock(); + VPIRBasicBlock *NewScalarHeader = cast(*find_if( + vp_depth_first_shallow(NewEntry), [ScalarHeaderIRBB](VPBlockBase *VPB) { + auto *VPIRBB = dyn_cast(VPB); + return VPIRBB && VPIRBB->getIRBasicBlock() == ScalarHeaderIRBB; + })); // Create VPlan, clone live-ins and remap operands in the cloned blocks. - auto *NewPlan = new VPlan(NewPreheader, cast(NewEntry)); + auto *NewPlan = + new VPlan(NewPreheader, cast(NewEntry), NewScalarHeader); DenseMap Old2NewVPValues; for (VPValue *OldLiveIn : VPLiveInsToFree) { Old2NewVPValues[OldLiveIn] = @@ -1286,10 +1279,6 @@ VPlan *VPlan::duplicate() { remapOperands(Preheader, NewPreheader, Old2NewVPValues); remapOperands(Entry, NewEntry, Old2NewVPValues); - // Clone live-outs. - for (const auto &[_, LO] : LiveOuts) - NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]); - // Initialize remaining fields of cloned VPlan. NewPlan->VFs = VFs; NewPlan->UFs = UFs; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0e0c64f6df9cb..cf4b38b340dc1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -675,48 +675,6 @@ class VPBlockBase { virtual VPBlockBase *clone() = 0; }; -/// A value that is used outside the VPlan. The operand of the user needs to be -/// added to the associated phi node. The incoming block from VPlan is -/// determined by where the VPValue is defined: if it is defined by a recipe -/// outside a region, its parent block is used, otherwise the middle block is -/// used. 
-class VPLiveOut : public VPUser { - PHINode *Phi; - -public: - VPLiveOut(PHINode *Phi, VPValue *Op) - : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} - - static inline bool classof(const VPUser *U) { - return U->getVPUserID() == VPUser::VPUserID::LiveOut; - } - - /// Fix the wrapped phi node. This means adding an incoming value to exit - /// block phi's from the vector loop via middle block (values from scalar loop - /// already reach these phi's), and updating the value to scalar header phi's - /// from the scalar preheader. - void fixPhi(VPlan &Plan, VPTransformState &State); - - /// Returns true if the VPLiveOut uses scalars of operand \p Op. - bool usesScalars(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return true; - } - - PHINode *getPhi() const { return Phi; } - - /// Live-outs are marked as only using the first part during the transition - /// to unrolling directly on VPlan. - /// TODO: Remove after unroller transition. - bool onlyFirstPartUsed(const VPValue *Op) const override { return true; } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPLiveOut to \p O. - void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; -#endif -}; - /// Struct to hold various analysis needed for cost computations. struct VPCostContext { const TargetTransformInfo &TTI; @@ -763,12 +721,12 @@ class VPRecipeBase : public ilist_node_with_parent, public: VPRecipeBase(const unsigned char SC, ArrayRef Operands, DebugLoc DL = {}) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} + : VPDef(SC), VPUser(Operands), DL(DL) {} template VPRecipeBase(const unsigned char SC, iterator_range Operands, DebugLoc DL = {}) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} + : VPDef(SC), VPUser(Operands), DL(DL) {} virtual ~VPRecipeBase() = default; /// Clone the current recipe. 
@@ -822,9 +780,7 @@ class VPRecipeBase : public ilist_node_with_parent, return true; } - static inline bool classof(const VPUser *U) { - return U->getVPUserID() == VPUser::VPUserID::Recipe; - } + static inline bool classof(const VPUser *U) { return true; } /// Returns true if the recipe may have side-effects. bool mayHaveSideEffects() const; @@ -1465,6 +1421,12 @@ class VPIRInstruction : public VPRecipeBase { "Op must be an operand of the recipe"); return true; } + + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// VPWidenRecipe is a recipe for producing a widened instruction using the @@ -2801,7 +2763,8 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe { VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC) - /// Generates phi nodes for live-outs as needed to retain SSA form. + /// Generates phi nodes for live-outs (from a replicate region) as needed to + /// retain SSA form. void execute(VPTransformState &State) override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3676,6 +3639,9 @@ class VPlan { /// rest of VPlan execution. VPBasicBlock *Preheader; + /// VPIRBasicBlock wrapping the header of the original scalar loop. + VPIRBasicBlock *ScalarHeader; + /// Holds the VFs applicable to this VPlan. SmallSetVector VFs; @@ -3711,37 +3677,38 @@ class VPlan { /// definitions are VPValues that hold a pointer to their underlying IR. SmallVector VPLiveInsToFree; - /// Values used outside the plan. It contains live-outs that need fixing. Any - /// live-out that is fixed outside VPlan needs to be removed. The remaining - /// live-outs are fixed via VPLiveOut::fixPhi. - MapVector LiveOuts; - /// Mapping from SCEVs to the VPValues representing their expansions. /// NOTE: This mapping is temporary and will be removed once all users have /// been modeled in VPlan directly. 
DenseMap SCEVToExpansion; public: - /// Construct a VPlan with original preheader \p Preheader, trip count \p TC - /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to - /// be disconnected, as the bypass blocks between them are not yet modeled in + /// Construct a VPlan with original preheader \p Preheader, trip count \p TC, + /// \p Entry to the plan and with \p ScalarHeader wrapping the original header + /// of the scalar loop. At the moment, \p Preheader and \p Entry need to be + /// disconnected, as the bypass blocks between them are not yet modeled in /// VPlan. - VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry) - : VPlan(Preheader, Entry) { + VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry, + VPIRBasicBlock *ScalarHeader) + : VPlan(Preheader, Entry, ScalarHeader) { TripCount = TC; } - /// Construct a VPlan with original preheader \p Preheader and \p Entry to - /// the plan. At the moment, \p Preheader and \p Entry need to be + /// Construct a VPlan with original preheader \p Preheader, \p Entry to + /// the plan and with \p ScalarHeader wrapping the original header of the + /// scalar loop. At the moment, \p Preheader and \p Entry need to be /// disconnected, as the bypass blocks between them are not yet modeled in /// VPlan. 
- VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry) - : Entry(Entry), Preheader(Preheader) { + VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry, + VPIRBasicBlock *ScalarHeader) + : Entry(Entry), Preheader(Preheader), ScalarHeader(ScalarHeader) { Entry->setPlan(this); Preheader->setPlan(this); assert(Preheader->getNumSuccessors() == 0 && Preheader->getNumPredecessors() == 0 && "preheader must be disconnected"); + assert(ScalarHeader->getNumSuccessors() == 0 && + "scalar header must be a leaf node"); } ~VPlan(); @@ -3773,6 +3740,24 @@ class VPlan { VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } + /// Return the VPIRBasicBlock wrapping the header of the scalar loop. + VPIRBasicBlock *getScalarHeader() const { return ScalarHeader; } + + /// Return the VPBasicBlock for the preheader of the scalar loop. + VPBasicBlock *getScalarPreheader() const { + return cast(ScalarHeader->getSinglePredecessor()); + } + + /// Returns the 'middle' block of the plan, that is the block that selects + /// whether to execute the scalar tail loop or the exit block from the loop + /// latch. + const VPBasicBlock *getMiddleBlock() const { + return cast(getVectorLoopRegion()->getSingleSuccessor()); + } + VPBasicBlock *getMiddleBlock() { + return cast(getVectorLoopRegion()->getSingleSuccessor()); + } + /// The trip count of the original loop. 
VPValue *getTripCount() const { assert(TripCount && "trip count needs to be set before accessing it"); @@ -3900,12 +3885,6 @@ class VPlan { return cast(&*EntryVPBB->begin()); } - void addLiveOut(PHINode *PN, VPValue *V); - - const MapVector &getLiveOuts() const { - return LiveOuts; - } - VPValue *getSCEVExpansion(const SCEV *S) const { return SCEVToExpansion.lookup(S); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index de7023167df89..2ecd546633825 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -214,35 +214,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { } } -void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { - VPValue *ExitValue = getOperand(0); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); - auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr; - // Values leaving the vector loop reach live out phi's in the exiting block - // via middle block. - auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion() - ? 
MiddleVPBB - : ExitingVPBB; - BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - Value *V = State.get(ExitValue, VPLane(0)); - if (Phi->getBasicBlockIndex(PredBB) != -1) - Phi->setIncomingValueForBlock(PredBB, V); - else - Phi->addIncoming(V, PredBB); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { - O << "Live-out "; - getPhi()->printAsOperand(O); - O << " = "; - getOperand(0)->printAsOperand(O, SlotTracker); - O << "\n"; -} -#endif - void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -873,7 +844,12 @@ void VPIRInstruction::execute(VPTransformState &State) { State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); Value *V = State.get(ExitValue, VPLane(Lane)); auto *Phi = cast(&I); - Phi->addIncoming(V, PredBB); + // If there is no existing block for PredBB in the phi, add a new incoming + // value. Otherwise update the existing incoming value for PredBB. + if (Phi->getBasicBlockIndex(PredBB) == -1) + Phi->addIncoming(V, PredBB); + else + Phi->setIncomingValueForBlock(PredBB, V); } // Advance the insert point after the wrapped IR instruction. This allows diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 355781f955052..622b2592f3e09 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -151,9 +151,7 @@ static bool sinkScalarOperands(VPlan &Plan) { // SinkCandidate. 
auto CanSinkWithUser = [SinkTo, &NeedsDuplicating, SinkCandidate](VPUser *U) { - auto *UI = dyn_cast(U); - if (!UI) - return false; + auto *UI = cast(U); if (UI->getParent() == SinkTo) return true; NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); @@ -280,8 +278,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { cast(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) { - auto *UI = dyn_cast(&U); - return UI && UI->getParent() == Then2; + return cast(&U)->getParent() == Then2; }); // Remove phi recipes that are unused after merging the regions. @@ -376,10 +373,10 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { SmallVector WorkList; for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getEntry()))) { - // Don't fold the exit block of the Plan into its single predecessor for - // now. + // Don't fold the blocks in the skeleton of the Plan into their single + // predecessors for now. // TODO: Remove restriction once more of the skeleton is modeled in VPlan. - if (VPBB->getNumSuccessors() == 0 && !VPBB->getParent()) + if (!VPBB->getParent()) continue; auto *PredVPBB = dyn_cast_or_null(VPBB->getSinglePredecessor()); @@ -750,9 +747,8 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, "only recipes with a single defined value expected"); for (VPUser *User : Current->getVPSingleValue()->users()) { - if (auto *R = dyn_cast(User)) - if (!TryToPushSinkCandidate(R)) - return false; + if (!TryToPushSinkCandidate(cast(User))) + return false; } } @@ -786,16 +782,14 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, // Find the closest hoist point by looking at all users of FOR and selecting // the recipe dominating all other users. 
for (VPUser *U : FOR->users()) { - auto *R = dyn_cast(U); - if (!R) - continue; + auto *R = cast(U); if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint)) HoistPoint = R; } assert(all_of(FOR->users(), [&VPDT, HoistPoint](VPUser *U) { - auto *R = dyn_cast(U); - return !R || HoistPoint == R || + auto *R = cast(U); + return HoistPoint == R || VPDT.properlyDominates(HoistPoint, R); }) && "HoistPoint must dominate all users of FOR"); @@ -922,8 +916,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, static SmallVector collectUsersRecursively(VPValue *V) { SetVector Users(V->user_begin(), V->user_end()); for (unsigned I = 0; I != Users.size(); ++I) { - VPRecipeBase *Cur = dyn_cast(Users[I]); - if (!Cur || isa(Cur)) + VPRecipeBase *Cur = cast(Users[I]); + if (isa(Cur)) continue; for (VPValue *V : Cur->definedValues()) Users.insert(V->user_begin(), V->user_end()); @@ -1044,9 +1038,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); for (VPUser *U : A->users()) { - auto *R = dyn_cast(U); - if (!R) - continue; + auto *R = cast(U); for (VPValue *VPV : R->definedValues()) assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); } @@ -1455,9 +1447,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *CurRecipe = dyn_cast(U); - if (!CurRecipe) - continue; + auto *CurRecipe = cast(U); auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { assert(OrigMask && "Unmasked recipe when folding tail"); return HeaderMask == OrigMask ? 
nullptr : OrigMask; diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 1e32865e8ee57..15dcf4dc0d91e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -449,11 +449,5 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { Part++; } - // Remap the operand of live-outs to the last part. - for (const auto &[_, LO] : Plan.getLiveOuts()) { - VPValue *In = Unroller.getValueForPart(LO->getOperand(0), UF - 1); - LO->setOperand(0, In); - } - VPlanTransforms::removeDeadRecipes(Plan); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 89b3ed72b8eb6..691b0d40823cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -39,8 +39,8 @@ class VPRecipeBase; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins -// coming from the input IR, instructions which VPlan will generate if executed -// and live-outs which the VPlan will need to fix accordingly. +// coming from the input IR and instructions which VPlan will generate if +// executed. class VPValue { friend class VPBuilder; friend class VPDef; @@ -198,34 +198,23 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V); /// This class augments VPValue with operands which provide the inverse def-use /// edges from VPValue's users to their defs. class VPUser { -public: - /// Subclass identifier (for isa/dyn_cast). - enum class VPUserID { - Recipe, - LiveOut, - }; - -private: SmallVector Operands; - VPUserID ID; - protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the operands to \p O. 
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; #endif - VPUser(ArrayRef Operands, VPUserID ID) : ID(ID) { + VPUser(ArrayRef Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } - VPUser(std::initializer_list Operands, VPUserID ID) - : VPUser(ArrayRef(Operands), ID) {} + VPUser(std::initializer_list Operands) + : VPUser(ArrayRef(Operands)) {} - template - VPUser(iterator_range Operands, VPUserID ID) : ID(ID) { + template VPUser(iterator_range Operands) { for (VPValue *Operand : Operands) addOperand(Operand); } @@ -239,8 +228,6 @@ class VPUser { Op->removeUser(*this); } - VPUserID getVPUserID() const { return ID; } - void addOperand(VPValue *Operand) { Operands.push_back(Operand); Operand->addUser(*this); diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7ea5ee341cc54..3b7ba61454899 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } - VPBlockBase *MiddleBB = - IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor(); - if (IRBB != IRBB->getPlan()->getPreheader() && - IRBB->getSinglePredecessor() != MiddleBB) { - errs() << "VPIRBasicBlock can only be used as pre-header or a successor of " - "middle-block at the moment!\n"; - return false; - } return true; } @@ -420,12 +412,6 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return false; } - for (const auto &KV : Plan.getLiveOuts()) - if (KV.second->getNumOperands() != 1) { - errs() << "live outs must have a single operand\n"; - return false; - } - return true; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 58145c7e3c591..3283cc8a229e5 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -112,6 +112,7 @@ class 
VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool foldPermuteOfBinops(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleOfCastops(Instruction &I); bool foldShuffleOfShuffles(Instruction &I); @@ -1400,6 +1401,100 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to convert "shuffle (binop (shuffle, shuffle)), undef" +/// --> "binop (shuffle), (shuffle)". +bool VectorCombine::foldPermuteOfBinops(Instruction &I) { + BinaryOperator *BinOp; + ArrayRef OuterMask; + if (!match(&I, + m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask)))) + return false; + + // Don't introduce poison into div/rem. + if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem)) + return false; + + Value *Op00, *Op01; + ArrayRef Mask0; + if (!match(BinOp->getOperand(0), + m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))))) + return false; + + Value *Op10, *Op11; + ArrayRef Mask1; + if (!match(BinOp->getOperand(1), + m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))))) + return false; + + Instruction::BinaryOps Opcode = BinOp->getOpcode(); + auto *ShuffleDstTy = dyn_cast(I.getType()); + auto *BinOpTy = dyn_cast(BinOp->getType()); + auto *Op0Ty = dyn_cast(Op00->getType()); + auto *Op1Ty = dyn_cast(Op10->getType()); + if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty) + return false; + + unsigned NumSrcElts = BinOpTy->getNumElements(); + + // Don't accept shuffles that reference the second operand in + // div/rem or if its an undef arg. + if ((BinOp->isIntDivRem() || !isa(I.getOperand(1))) && + any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; })) + return false; + + // Merge outer / inner shuffles. 
+ SmallVector NewMask0, NewMask1; + for (int M : OuterMask) { + if (M < 0 || M >= (int)NumSrcElts) { + NewMask0.push_back(PoisonMaskElem); + NewMask1.push_back(PoisonMaskElem); + } else { + NewMask0.push_back(Mask0[M]); + NewMask1.push_back(Mask1[M]); + } + } + + // Try to merge shuffles across the binop if the new shuffles are not costly. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy, + OuterMask, CostKind, 0, nullptr, {BinOp}, &I) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0, + CostKind, 0, nullptr, {Op00, Op01}, + cast(BinOp->getOperand(0))) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1, + CostKind, 0, nullptr, {Op10, Op11}, + cast(BinOp->getOperand(1))); + + InstructionCost NewCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0, + CostKind, 0, nullptr, {Op00, Op01}) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, NewMask1, + CostKind, 0, nullptr, {Op10, Op11}) + + TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); + + LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost >= OldCost) + return false; + + Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0); + Value *Shuf1 = Builder.CreateShuffleVector(Op10, Op11, NewMask1); + Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + + // Intersect flags from the old binops. + if (auto *NewInst = dyn_cast(NewBO)) + NewInst->copyIRFlags(BinOp); + + Worklist.pushValue(Shuf0); + Worklist.pushValue(Shuf1); + replaceValue(I, *NewBO); + return true; +} + /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". 
bool VectorCombine::foldShuffleOfBinops(Instruction &I) { BinaryOperator *B0, *B1; @@ -2736,6 +2831,7 @@ bool VectorCombine::run() { MadeChange |= foldInsExtFNeg(I); break; case Instruction::ShuffleVector: + MadeChange |= foldPermuteOfBinops(I); MadeChange |= foldShuffleOfBinops(I); MadeChange |= foldShuffleOfCastops(I); MadeChange |= foldShuffleOfShuffles(I); diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll index 6e4061a42bf9b..0b2c8da4438da 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll @@ -30,20 +30,20 @@ define void @fabs() { call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) - call @llvm.fabs.nvx1f32( undef) - call @llvm.fabs.nvx2f32( undef) - call @llvm.fabs.nvx4f32( undef) - call @llvm.fabs.nvx8f32( undef) - call @llvm.fabs.nvx16f32( undef) + call @llvm.fabs.nxv1f32( undef) + call @llvm.fabs.nxv2f32( undef) + call @llvm.fabs.nxv4f32( undef) + call @llvm.fabs.nxv8f32( undef) + call @llvm.fabs.nxv16f32( undef) call double @llvm.fabs.f64(double undef) call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) call <16 x double> @llvm.fabs.v16f64(<16 x double> undef) - call @llvm.fabs.nvx1f64( undef) - call @llvm.fabs.nvx2f64( undef) - call @llvm.fabs.nvx4f64( undef) - call @llvm.fabs.nvx8f64( undef) + call @llvm.fabs.nxv1f64( undef) + call @llvm.fabs.nxv2f64( undef) + call @llvm.fabs.nxv4f64( undef) + call @llvm.fabs.nxv8f64( undef) ret void } @@ -65,10 +65,10 @@ define void @fabs_f16() { call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) call <8 x half> @llvm.fabs.v8f16(<8 x half> undef) call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) - call @llvm.fabs.nvx2f16( undef) - call @llvm.fabs.nvx4f16( 
undef) - call @llvm.fabs.nvx8f16( undef) - call @llvm.fabs.nvx16f16( undef) + call @llvm.fabs.nxv2f16( undef) + call @llvm.fabs.nxv4f16( undef) + call @llvm.fabs.nxv8f16( undef) + call @llvm.fabs.nxv16f16( undef) ret void } @@ -100,20 +100,20 @@ define void @minnum() { call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.minnum.nvx1f32( undef, undef) - call @llvm.minnum.nvx2f32( undef, undef) - call @llvm.minnum.nvx4f32( undef, undef) - call @llvm.minnum.nvx8f32( undef, undef) - call @llvm.minnum.nvx16f32( undef, undef) + call @llvm.minnum.nxv1f32( undef, undef) + call @llvm.minnum.nxv2f32( undef, undef) + call @llvm.minnum.nxv4f32( undef, undef) + call @llvm.minnum.nxv8f32( undef, undef) + call @llvm.minnum.nxv16f32( undef, undef) call double @llvm.minnum.f64(double undef, double undef) call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef) - call @llvm.minnum.nvx1f64( undef, undef) - call @llvm.minnum.nvx2f64( undef, undef) - call @llvm.minnum.nvx4f64( undef, undef) - call @llvm.minnum.nvx8f64( undef, undef) + call @llvm.minnum.nxv1f64( undef, undef) + call @llvm.minnum.nxv2f64( undef, undef) + call @llvm.minnum.nxv4f64( undef, undef) + call @llvm.minnum.nxv8f64( undef, undef) ret void } @@ -149,11 +149,11 @@ define void @minnum_f16() { call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.minnum.nvx1f16( undef, undef) - call @llvm.minnum.nvx2f16( undef, 
undef) - call @llvm.minnum.nvx4f16( undef, undef) - call @llvm.minnum.nvx8f16( undef, undef) - call @llvm.minnum.nvx16f16( undef, undef) + call @llvm.minnum.nxv1f16( undef, undef) + call @llvm.minnum.nxv2f16( undef, undef) + call @llvm.minnum.nxv4f16( undef, undef) + call @llvm.minnum.nxv8f16( undef, undef) + call @llvm.minnum.nxv16f16( undef, undef) ret void } @@ -185,20 +185,20 @@ define void @maxnum() { call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.maxnum.nvx1f32( undef, undef) - call @llvm.maxnum.nvx2f32( undef, undef) - call @llvm.maxnum.nvx4f32( undef, undef) - call @llvm.maxnum.nvx8f32( undef, undef) - call @llvm.maxnum.nvx16f32( undef, undef) + call @llvm.maxnum.nxv1f32( undef, undef) + call @llvm.maxnum.nxv2f32( undef, undef) + call @llvm.maxnum.nxv4f32( undef, undef) + call @llvm.maxnum.nxv8f32( undef, undef) + call @llvm.maxnum.nxv16f32( undef, undef) call double @llvm.maxnum.f64(double undef, double undef) call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef) - call @llvm.maxnum.nvx1f64( undef, undef) - call @llvm.maxnum.nvx2f64( undef, undef) - call @llvm.maxnum.nvx4f64( undef, undef) - call @llvm.maxnum.nvx8f64( undef, undef) + call @llvm.maxnum.nxv1f64( undef, undef) + call @llvm.maxnum.nxv2f64( undef, undef) + call @llvm.maxnum.nxv4f64( undef, undef) + call @llvm.maxnum.nxv8f64( undef, undef) ret void } @@ -234,11 +234,11 @@ define void @maxnum_f16() { call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) call <16 x 
half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.maxnum.nvx1f16( undef, undef) - call @llvm.maxnum.nvx2f16( undef, undef) - call @llvm.maxnum.nvx4f16( undef, undef) - call @llvm.maxnum.nvx8f16( undef, undef) - call @llvm.maxnum.nvx16f16( undef, undef) + call @llvm.maxnum.nxv1f16( undef, undef) + call @llvm.maxnum.nxv2f16( undef, undef) + call @llvm.maxnum.nxv4f16( undef, undef) + call @llvm.maxnum.nxv8f16( undef, undef) + call @llvm.maxnum.nxv16f16( undef, undef) ret void } @@ -270,20 +270,20 @@ define void @minimum() { call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.minimum.nvx1f32( undef, undef) - call @llvm.minimum.nvx2f32( undef, undef) - call @llvm.minimum.nvx4f32( undef, undef) - call @llvm.minimum.nvx8f32( undef, undef) - call @llvm.minimum.nvx16f32( undef, undef) + call @llvm.minimum.nxv1f32( undef, undef) + call @llvm.minimum.nxv2f32( undef, undef) + call @llvm.minimum.nxv4f32( undef, undef) + call @llvm.minimum.nxv8f32( undef, undef) + call @llvm.minimum.nxv16f32( undef, undef) call double @llvm.minimum.f64(double undef, double undef) call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef) - call @llvm.minimum.nvx1f64( undef, undef) - call @llvm.minimum.nvx2f64( undef, undef) - call @llvm.minimum.nvx4f64( undef, undef) - call @llvm.minimum.nvx8f64( undef, undef) + call @llvm.minimum.nxv1f64( undef, undef) + call @llvm.minimum.nxv2f64( undef, undef) + call @llvm.minimum.nxv4f64( undef, undef) + call @llvm.minimum.nxv8f64( undef, undef) ret void } @@ -319,11 +319,11 @@ define void 
@minimum_f16() { call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.minimum.nvx1f16( undef, undef) - call @llvm.minimum.nvx2f16( undef, undef) - call @llvm.minimum.nvx4f16( undef, undef) - call @llvm.minimum.nvx8f16( undef, undef) - call @llvm.minimum.nvx16f16( undef, undef) + call @llvm.minimum.nxv1f16( undef, undef) + call @llvm.minimum.nxv2f16( undef, undef) + call @llvm.minimum.nxv4f16( undef, undef) + call @llvm.minimum.nxv8f16( undef, undef) + call @llvm.minimum.nxv16f16( undef, undef) ret void } @@ -355,20 +355,20 @@ define void @maximum() { call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.maximum.nvx1f32( undef, undef) - call @llvm.maximum.nvx2f32( undef, undef) - call @llvm.maximum.nvx4f32( undef, undef) - call @llvm.maximum.nvx8f32( undef, undef) - call @llvm.maximum.nvx16f32( undef, undef) + call @llvm.maximum.nxv1f32( undef, undef) + call @llvm.maximum.nxv2f32( undef, undef) + call @llvm.maximum.nxv4f32( undef, undef) + call @llvm.maximum.nxv8f32( undef, undef) + call @llvm.maximum.nxv16f32( undef, undef) call double @llvm.maximum.f64(double undef, double undef) call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef) - call @llvm.maximum.nvx1f64( undef, undef) - call @llvm.maximum.nvx2f64( undef, undef) - call @llvm.maximum.nvx4f64( undef, undef) - call @llvm.maximum.nvx8f64( undef, undef) + call @llvm.maximum.nxv1f64( undef, 
undef) + call @llvm.maximum.nxv2f64( undef, undef) + call @llvm.maximum.nxv4f64( undef, undef) + call @llvm.maximum.nxv8f64( undef, undef) ret void } @@ -404,11 +404,11 @@ define void @maximum_f16() { call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.maximum.nvx1f16( undef, undef) - call @llvm.maximum.nvx2f16( undef, undef) - call @llvm.maximum.nvx4f16( undef, undef) - call @llvm.maximum.nvx8f16( undef, undef) - call @llvm.maximum.nvx16f16( undef, undef) + call @llvm.maximum.nxv1f16( undef, undef) + call @llvm.maximum.nxv2f16( undef, undef) + call @llvm.maximum.nxv4f16( undef, undef) + call @llvm.maximum.nxv8f16( undef, undef) + call @llvm.maximum.nxv16f16( undef, undef) ret void } @@ -440,20 +440,20 @@ define void @copysign() { call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.copysign.nvx1f32( undef, undef) - call @llvm.copysign.nvx2f32( undef, undef) - call @llvm.copysign.nvx4f32( undef, undef) - call @llvm.copysign.nvx8f32( undef, undef) - call @llvm.copysign.nvx16f32( undef, undef) + call @llvm.copysign.nxv1f32( undef, undef) + call @llvm.copysign.nxv2f32( undef, undef) + call @llvm.copysign.nxv4f32( undef, undef) + call @llvm.copysign.nxv8f32( undef, undef) + call @llvm.copysign.nxv16f32( undef, undef) call double @llvm.copysign.f64(double undef, double undef) call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) - call 
@llvm.copysign.nvx1f64( undef, undef) - call @llvm.copysign.nvx2f64( undef, undef) - call @llvm.copysign.nvx4f64( undef, undef) - call @llvm.copysign.nvx8f64( undef, undef) + call @llvm.copysign.nxv1f64( undef, undef) + call @llvm.copysign.nxv2f64( undef, undef) + call @llvm.copysign.nxv4f64( undef, undef) + call @llvm.copysign.nxv8f64( undef, undef) ret void } @@ -489,10 +489,10 @@ define void @copysign_f16() { call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.copysign.nvx1f16( undef, undef) - call @llvm.copysign.nvx2f16( undef, undef) - call @llvm.copysign.nvx4f16( undef, undef) - call @llvm.copysign.nvx8f16( undef, undef) - call @llvm.copysign.nvx16f16( undef, undef) + call @llvm.copysign.nxv1f16( undef, undef) + call @llvm.copysign.nxv2f16( undef, undef) + call @llvm.copysign.nxv4f16( undef, undef) + call @llvm.copysign.nxv8f16( undef, undef) + call @llvm.copysign.nxv16f16( undef, undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll index efe17f2b76a70..be9c19dc59a85 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll @@ -39,29 +39,29 @@ define void @sqrt() { call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef) - call @llvm.sqrt.nvx2bf16( undef) - call @llvm.sqrt.nvx4bf16( undef) - call @llvm.sqrt.nvx8bf16( undef) - call @llvm.sqrt.nvx16bf16( undef) + call @llvm.sqrt.nxv2bf16( undef) + call @llvm.sqrt.nxv4bf16( undef) + call @llvm.sqrt.nxv8bf16( undef) + call @llvm.sqrt.nxv16bf16( undef) call float @llvm.sqrt.f32(float undef) call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef) call <4 x float> @llvm.sqrt.v4f32(<4 x 
float> undef) call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) - call @llvm.sqrt.nvx1f32( undef) - call @llvm.sqrt.nvx2f32( undef) - call @llvm.sqrt.nvx4f32( undef) - call @llvm.sqrt.nvx8f32( undef) - call @llvm.sqrt.nvx16f32( undef) + call @llvm.sqrt.nxv1f32( undef) + call @llvm.sqrt.nxv2f32( undef) + call @llvm.sqrt.nxv4f32( undef) + call @llvm.sqrt.nxv8f32( undef) + call @llvm.sqrt.nxv16f32( undef) call double @llvm.sqrt.f64(double undef) call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef) - call @llvm.sqrt.nvx1f64( undef) - call @llvm.sqrt.nvx2f64( undef) - call @llvm.sqrt.nvx4f64( undef) - call @llvm.sqrt.nvx8f64( undef) + call @llvm.sqrt.nxv1f64( undef) + call @llvm.sqrt.nxv2f64( undef) + call @llvm.sqrt.nxv4f64( undef) + call @llvm.sqrt.nxv8f64( undef) ret void } @@ -83,10 +83,10 @@ define void @sqrt_f16() { call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) - call @llvm.sqrt.nvx2f16( undef) - call @llvm.sqrt.nvx4f16( undef) - call @llvm.sqrt.nvx8f16( undef) - call @llvm.sqrt.nvx16f16( undef) + call @llvm.sqrt.nxv2f16( undef) + call @llvm.sqrt.nxv4f16( undef) + call @llvm.sqrt.nxv8f16( undef) + call @llvm.sqrt.nxv16f16( undef) ret void } @@ -128,30 +128,30 @@ define void @pow() { call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) - call @llvm.pow.nvx1bf16( undef, undef) - call @llvm.pow.nvx2bf16( undef, undef) - call @llvm.pow.nvx4bf16( undef, undef) - call @llvm.pow.nvx8bf16( undef, undef) - call @llvm.pow.nvx16bf16( undef, undef) + call 
@llvm.pow.nxv1bf16( undef, undef) + call @llvm.pow.nxv2bf16( undef, undef) + call @llvm.pow.nxv4bf16( undef, undef) + call @llvm.pow.nxv8bf16( undef, undef) + call @llvm.pow.nxv16bf16( undef, undef) call float @llvm.pow.f32(float undef, float undef) call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef) call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef) - call @llvm.pow.nvx1f32( undef, undef) - call @llvm.pow.nvx2f32( undef, undef) - call @llvm.pow.nvx4f32( undef, undef) - call @llvm.pow.nvx8f32( undef, undef) - call @llvm.pow.nvx16f32( undef, undef) + call @llvm.pow.nxv1f32( undef, undef) + call @llvm.pow.nxv2f32( undef, undef) + call @llvm.pow.nxv4f32( undef, undef) + call @llvm.pow.nxv8f32( undef, undef) + call @llvm.pow.nxv16f32( undef, undef) call double @llvm.pow.f64(double undef, double undef) call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef) call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef) call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef) call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef) - call @llvm.pow.nvx1f64( undef, undef) - call @llvm.pow.nvx2f64( undef, undef) - call @llvm.pow.nvx4f64( undef, undef) - call @llvm.pow.nvx8f64( undef, undef) + call @llvm.pow.nxv1f64( undef, undef) + call @llvm.pow.nxv2f64( undef, undef) + call @llvm.pow.nxv4f64( undef, undef) + call @llvm.pow.nxv8f64( undef, undef) ret void } @@ -187,10 +187,10 @@ define void @pow_f16() { call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) - call @llvm.pow.nvx1f16( undef, undef) - call @llvm.pow.nvx2f16( undef, undef) - call @llvm.pow.nvx4f16( undef, undef) 
- call @llvm.pow.nvx8f16( undef, undef) - call @llvm.pow.nvx16f16( undef, undef) + call @llvm.pow.nxv1f16( undef, undef) + call @llvm.pow.nxv2f16( undef, undef) + call @llvm.pow.nxv4f16( undef, undef) + call @llvm.pow.nxv8f16( undef, undef) + call @llvm.pow.nxv16f16( undef, undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll index 34d6c93f4577a..ba5e40ca03b88 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll @@ -40,30 +40,30 @@ define void @sin() { call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) - call @llvm.sin.nvx1bf16( undef) - call @llvm.sin.nvx2bf16( undef) - call @llvm.sin.nvx4bf16( undef) - call @llvm.sin.nvx8bf16( undef) - call @llvm.sin.nvx16bf16( undef) + call @llvm.sin.nxv1bf16( undef) + call @llvm.sin.nxv2bf16( undef) + call @llvm.sin.nxv4bf16( undef) + call @llvm.sin.nxv8bf16( undef) + call @llvm.sin.nxv16bf16( undef) call float @llvm.sin.f32(float undef) call <2 x float> @llvm.sin.v2f32(<2 x float> undef) call <4 x float> @llvm.sin.v4f32(<4 x float> undef) call <8 x float> @llvm.sin.v8f32(<8 x float> undef) call <16 x float> @llvm.sin.v16f32(<16 x float> undef) - call @llvm.sin.nvx1f32( undef) - call @llvm.sin.nvx2f32( undef) - call @llvm.sin.nvx4f32( undef) - call @llvm.sin.nvx8f32( undef) - call @llvm.sin.nvx16f32( undef) + call @llvm.sin.nxv1f32( undef) + call @llvm.sin.nxv2f32( undef) + call @llvm.sin.nxv4f32( undef) + call @llvm.sin.nxv8f32( undef) + call @llvm.sin.nxv16f32( undef) call double @llvm.sin.f64(double undef) call <2 x double> @llvm.sin.v2f64(<2 x double> undef) call <4 x double> @llvm.sin.v4f64(<4 x double> undef) call <8 x double> @llvm.sin.v8f64(<8 x double> undef) call <16 x double> @llvm.sin.v16f64(<16 x double> undef) - call @llvm.sin.nvx1f64( 
undef) - call @llvm.sin.nvx2f64( undef) - call @llvm.sin.nvx4f64( undef) - call @llvm.sin.nvx8f64( undef) + call @llvm.sin.nxv1f64( undef) + call @llvm.sin.nxv2f64( undef) + call @llvm.sin.nxv4f64( undef) + call @llvm.sin.nxv8f64( undef) ret void } @@ -97,10 +97,10 @@ define void @sin_f16() { call <4 x half> @llvm.sin.v4f16(<4 x half> undef) call <8 x half> @llvm.sin.v8f16(<8 x half> undef) call <16 x half> @llvm.sin.v16f16(<16 x half> undef) - call @llvm.sin.nvx1f16( undef) - call @llvm.sin.nvx2f16( undef) - call @llvm.sin.nvx4f16( undef) - call @llvm.sin.nvx8f16( undef) + call @llvm.sin.nxv1f16( undef) + call @llvm.sin.nxv2f16( undef) + call @llvm.sin.nxv4f16( undef) + call @llvm.sin.nxv8f16( undef) ret void } @@ -142,30 +142,30 @@ define void @cos() { call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef) - call @llvm.cos.nvx1bf16( undef) - call @llvm.cos.nvx2bf16( undef) - call @llvm.cos.nvx4bf16( undef) - call @llvm.cos.nvx8bf16( undef) - call @llvm.cos.nvx16bf16( undef) + call @llvm.cos.nxv1bf16( undef) + call @llvm.cos.nxv2bf16( undef) + call @llvm.cos.nxv4bf16( undef) + call @llvm.cos.nxv8bf16( undef) + call @llvm.cos.nxv16bf16( undef) call float @llvm.cos.f32(float undef) call <2 x float> @llvm.cos.v2f32(<2 x float> undef) call <4 x float> @llvm.cos.v4f32(<4 x float> undef) call <8 x float> @llvm.cos.v8f32(<8 x float> undef) call <16 x float> @llvm.cos.v16f32(<16 x float> undef) - call @llvm.cos.nvx1f32( undef) - call @llvm.cos.nvx2f32( undef) - call @llvm.cos.nvx4f32( undef) - call @llvm.cos.nvx8f32( undef) - call @llvm.cos.nvx16f32( undef) + call @llvm.cos.nxv1f32( undef) + call @llvm.cos.nxv2f32( undef) + call @llvm.cos.nxv4f32( undef) + call @llvm.cos.nxv8f32( undef) + call @llvm.cos.nxv16f32( undef) call double @llvm.cos.f64(double undef) call <2 x double> @llvm.cos.v2f64(<2 x double> undef) call <4 x double> @llvm.cos.v4f64(<4 x double> 
undef) call <8 x double> @llvm.cos.v8f64(<8 x double> undef) call <16 x double> @llvm.cos.v16f64(<16 x double> undef) - call @llvm.cos.nvx1f64( undef) - call @llvm.cos.nvx2f64( undef) - call @llvm.cos.nvx4f64( undef) - call @llvm.cos.nvx8f64( undef) + call @llvm.cos.nxv1f64( undef) + call @llvm.cos.nxv2f64( undef) + call @llvm.cos.nxv4f64( undef) + call @llvm.cos.nxv8f64( undef) ret void } @@ -199,10 +199,10 @@ define void @cos_f16() { call <4 x half> @llvm.cos.v4f16(<4 x half> undef) call <8 x half> @llvm.cos.v8f16(<8 x half> undef) call <16 x half> @llvm.cos.v16f16(<16 x half> undef) - call @llvm.cos.nvx1f16( undef) - call @llvm.cos.nvx2f16( undef) - call @llvm.cos.nvx4f16( undef) - call @llvm.cos.nvx8f16( undef) + call @llvm.cos.nxv1f16( undef) + call @llvm.cos.nxv2f16( undef) + call @llvm.cos.nxv4f16( undef) + call @llvm.cos.nxv8f16( undef) ret void } @@ -244,30 +244,30 @@ define void @exp() { call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) - call @llvm.exp.nvx1bf16( undef) - call @llvm.exp.nvx2bf16( undef) - call @llvm.exp.nvx4bf16( undef) - call @llvm.exp.nvx8bf16( undef) - call @llvm.exp.nvx16bf16( undef) + call @llvm.exp.nxv1bf16( undef) + call @llvm.exp.nxv2bf16( undef) + call @llvm.exp.nxv4bf16( undef) + call @llvm.exp.nxv8bf16( undef) + call @llvm.exp.nxv16bf16( undef) call float @llvm.exp.f32(float undef) call <2 x float> @llvm.exp.v2f32(<2 x float> undef) call <4 x float> @llvm.exp.v4f32(<4 x float> undef) call <8 x float> @llvm.exp.v8f32(<8 x float> undef) call <16 x float> @llvm.exp.v16f32(<16 x float> undef) - call @llvm.exp.nvx1f32( undef) - call @llvm.exp.nvx2f32( undef) - call @llvm.exp.nvx4f32( undef) - call @llvm.exp.nvx8f32( undef) - call @llvm.exp.nvx16f32( undef) + call @llvm.exp.nxv1f32( undef) + call @llvm.exp.nxv2f32( undef) + call @llvm.exp.nxv4f32( undef) + call @llvm.exp.nxv8f32( undef) + call @llvm.exp.nxv16f32( 
undef) call double @llvm.exp.f64(double undef) call <2 x double> @llvm.exp.v2f64(<2 x double> undef) call <4 x double> @llvm.exp.v4f64(<4 x double> undef) call <8 x double> @llvm.exp.v8f64(<8 x double> undef) call <16 x double> @llvm.exp.v16f64(<16 x double> undef) - call @llvm.exp.nvx1f64( undef) - call @llvm.exp.nvx2f64( undef) - call @llvm.exp.nvx4f64( undef) - call @llvm.exp.nvx8f64( undef) + call @llvm.exp.nxv1f64( undef) + call @llvm.exp.nxv2f64( undef) + call @llvm.exp.nxv4f64( undef) + call @llvm.exp.nxv8f64( undef) ret void } @@ -301,10 +301,10 @@ define void @exp_f16() { call <4 x half> @llvm.exp.v4f16(<4 x half> undef) call <8 x half> @llvm.exp.v8f16(<8 x half> undef) call <16 x half> @llvm.exp.v16f16(<16 x half> undef) - call @llvm.exp.nvx1f16( undef) - call @llvm.exp.nvx2f16( undef) - call @llvm.exp.nvx4f16( undef) - call @llvm.exp.nvx8f16( undef) + call @llvm.exp.nxv1f16( undef) + call @llvm.exp.nxv2f16( undef) + call @llvm.exp.nxv4f16( undef) + call @llvm.exp.nxv8f16( undef) ret void } @@ -346,30 +346,30 @@ define void @exp2() { call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) - call @llvm.exp2.nvx1bf16( undef) - call @llvm.exp2.nvx2bf16( undef) - call @llvm.exp2.nvx4bf16( undef) - call @llvm.exp2.nvx8bf16( undef) - call @llvm.exp2.nvx16bf16( undef) + call @llvm.exp2.nxv1bf16( undef) + call @llvm.exp2.nxv2bf16( undef) + call @llvm.exp2.nxv4bf16( undef) + call @llvm.exp2.nxv8bf16( undef) + call @llvm.exp2.nxv16bf16( undef) call float @llvm.exp2.f32(float undef) call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) - call @llvm.exp2.nvx1f32( undef) - call @llvm.exp2.nvx2f32( undef) - call @llvm.exp2.nvx4f32( undef) - call @llvm.exp2.nvx8f32( undef) - call 
@llvm.exp2.nvx16f32( undef) + call @llvm.exp2.nxv1f32( undef) + call @llvm.exp2.nxv2f32( undef) + call @llvm.exp2.nxv4f32( undef) + call @llvm.exp2.nxv8f32( undef) + call @llvm.exp2.nxv16f32( undef) call double @llvm.exp2.f64(double undef) call <2 x double> @llvm.exp2.v2f64(<2 x double> undef) call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) - call @llvm.exp2.nvx1f64( undef) - call @llvm.exp2.nvx2f64( undef) - call @llvm.exp2.nvx4f64( undef) - call @llvm.exp2.nvx8f64( undef) + call @llvm.exp2.nxv1f64( undef) + call @llvm.exp2.nxv2f64( undef) + call @llvm.exp2.nxv4f64( undef) + call @llvm.exp2.nxv8f64( undef) ret void } @@ -403,10 +403,10 @@ define void @exp2_f16() { call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) - call @llvm.exp2.nvx1f16( undef) - call @llvm.exp2.nvx2f16( undef) - call @llvm.exp2.nvx4f16( undef) - call @llvm.exp2.nvx8f16( undef) + call @llvm.exp2.nxv1f16( undef) + call @llvm.exp2.nxv2f16( undef) + call @llvm.exp2.nxv4f16( undef) + call @llvm.exp2.nxv8f16( undef) ret void } @@ -448,30 +448,30 @@ define void @log() { call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) - call @llvm.log.nvx1bf16( undef) - call @llvm.log.nvx2bf16( undef) - call @llvm.log.nvx4bf16( undef) - call @llvm.log.nvx8bf16( undef) - call @llvm.log.nvx16bf16( undef) + call @llvm.log.nxv1bf16( undef) + call @llvm.log.nxv2bf16( undef) + call @llvm.log.nxv4bf16( undef) + call @llvm.log.nxv8bf16( undef) + call @llvm.log.nxv16bf16( undef) call float @llvm.log.f32(float undef) call <2 x float> @llvm.log.v2f32(<2 x float> undef) call <4 x float> @llvm.log.v4f32(<4 x float> undef) call <8 x float> @llvm.log.v8f32(<8 x float> undef) call <16 x float> 
@llvm.log.v16f32(<16 x float> undef) - call @llvm.log.nvx1f32( undef) - call @llvm.log.nvx2f32( undef) - call @llvm.log.nvx4f32( undef) - call @llvm.log.nvx8f32( undef) - call @llvm.log.nvx16f32( undef) + call @llvm.log.nxv1f32( undef) + call @llvm.log.nxv2f32( undef) + call @llvm.log.nxv4f32( undef) + call @llvm.log.nxv8f32( undef) + call @llvm.log.nxv16f32( undef) call double @llvm.log.f64(double undef) call <2 x double> @llvm.log.v2f64(<2 x double> undef) call <4 x double> @llvm.log.v4f64(<4 x double> undef) call <8 x double> @llvm.log.v8f64(<8 x double> undef) call <16 x double> @llvm.log.v16f64(<16 x double> undef) - call @llvm.log.nvx1f64( undef) - call @llvm.log.nvx2f64( undef) - call @llvm.log.nvx4f64( undef) - call @llvm.log.nvx8f64( undef) + call @llvm.log.nxv1f64( undef) + call @llvm.log.nxv2f64( undef) + call @llvm.log.nxv4f64( undef) + call @llvm.log.nxv8f64( undef) ret void } @@ -505,10 +505,10 @@ define void @log_f16() { call <4 x half> @llvm.log.v4f16(<4 x half> undef) call <8 x half> @llvm.log.v8f16(<8 x half> undef) call <16 x half> @llvm.log.v16f16(<16 x half> undef) - call @llvm.log.nvx1f16( undef) - call @llvm.log.nvx2f16( undef) - call @llvm.log.nvx4f16( undef) - call @llvm.log.nvx8f16( undef) + call @llvm.log.nxv1f16( undef) + call @llvm.log.nxv2f16( undef) + call @llvm.log.nxv4f16( undef) + call @llvm.log.nxv8f16( undef) ret void } @@ -550,30 +550,30 @@ define void @log10() { call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) - call @llvm.log10.nvx1bf16( undef) - call @llvm.log10.nvx2bf16( undef) - call @llvm.log10.nvx4bf16( undef) - call @llvm.log10.nvx8bf16( undef) - call @llvm.log10.nvx16bf16( undef) + call @llvm.log10.nxv1bf16( undef) + call @llvm.log10.nxv2bf16( undef) + call @llvm.log10.nxv4bf16( undef) + call @llvm.log10.nxv8bf16( undef) + call @llvm.log10.nxv16bf16( undef) call float @llvm.log10.f32(float undef) 
call <2 x float> @llvm.log10.v2f32(<2 x float> undef) call <4 x float> @llvm.log10.v4f32(<4 x float> undef) call <8 x float> @llvm.log10.v8f32(<8 x float> undef) call <16 x float> @llvm.log10.v16f32(<16 x float> undef) - call @llvm.log10.nvx1f32( undef) - call @llvm.log10.nvx2f32( undef) - call @llvm.log10.nvx4f32( undef) - call @llvm.log10.nvx8f32( undef) - call @llvm.log10.nvx16f32( undef) + call @llvm.log10.nxv1f32( undef) + call @llvm.log10.nxv2f32( undef) + call @llvm.log10.nxv4f32( undef) + call @llvm.log10.nxv8f32( undef) + call @llvm.log10.nxv16f32( undef) call double @llvm.log10.f64(double undef) call <2 x double> @llvm.log10.v2f64(<2 x double> undef) call <4 x double> @llvm.log10.v4f64(<4 x double> undef) call <8 x double> @llvm.log10.v8f64(<8 x double> undef) call <16 x double> @llvm.log10.v16f64(<16 x double> undef) - call @llvm.log10.nvx1f64( undef) - call @llvm.log10.nvx2f64( undef) - call @llvm.log10.nvx4f64( undef) - call @llvm.log10.nvx8f64( undef) + call @llvm.log10.nxv1f64( undef) + call @llvm.log10.nxv2f64( undef) + call @llvm.log10.nxv4f64( undef) + call @llvm.log10.nxv8f64( undef) ret void } @@ -607,10 +607,10 @@ define void @log10_f16() { call <4 x half> @llvm.log10.v4f16(<4 x half> undef) call <8 x half> @llvm.log10.v8f16(<8 x half> undef) call <16 x half> @llvm.log10.v16f16(<16 x half> undef) - call @llvm.log10.nvx1f16( undef) - call @llvm.log10.nvx2f16( undef) - call @llvm.log10.nvx4f16( undef) - call @llvm.log10.nvx8f16( undef) + call @llvm.log10.nxv1f16( undef) + call @llvm.log10.nxv2f16( undef) + call @llvm.log10.nxv4f16( undef) + call @llvm.log10.nxv8f16( undef) ret void } @@ -652,30 +652,30 @@ define void @log2() { call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) - call @llvm.log2.nvx1bf16( undef) - call @llvm.log2.nvx2bf16( undef) - call @llvm.log2.nvx4bf16( undef) - call @llvm.log2.nvx8bf16( undef) - call 
@llvm.log2.nvx16bf16( undef) + call @llvm.log2.nxv1bf16( undef) + call @llvm.log2.nxv2bf16( undef) + call @llvm.log2.nxv4bf16( undef) + call @llvm.log2.nxv8bf16( undef) + call @llvm.log2.nxv16bf16( undef) call float @llvm.log2.f32(float undef) call <2 x float> @llvm.log2.v2f32(<2 x float> undef) call <4 x float> @llvm.log2.v4f32(<4 x float> undef) call <8 x float> @llvm.log2.v8f32(<8 x float> undef) call <16 x float> @llvm.log2.v16f32(<16 x float> undef) - call @llvm.log2.nvx1f32( undef) - call @llvm.log2.nvx2f32( undef) - call @llvm.log2.nvx4f32( undef) - call @llvm.log2.nvx8f32( undef) - call @llvm.log2.nvx16f32( undef) + call @llvm.log2.nxv1f32( undef) + call @llvm.log2.nxv2f32( undef) + call @llvm.log2.nxv4f32( undef) + call @llvm.log2.nxv8f32( undef) + call @llvm.log2.nxv16f32( undef) call double @llvm.log2.f64(double undef) call <2 x double> @llvm.log2.v2f64(<2 x double> undef) call <4 x double> @llvm.log2.v4f64(<4 x double> undef) call <8 x double> @llvm.log2.v8f64(<8 x double> undef) call <16 x double> @llvm.log2.v16f64(<16 x double> undef) - call @llvm.log2.nvx1f64( undef) - call @llvm.log2.nvx2f64( undef) - call @llvm.log2.nvx4f64( undef) - call @llvm.log2.nvx8f64( undef) + call @llvm.log2.nxv1f64( undef) + call @llvm.log2.nxv2f64( undef) + call @llvm.log2.nxv4f64( undef) + call @llvm.log2.nxv8f64( undef) ret void } @@ -709,10 +709,10 @@ define void @log2_f16() { call <4 x half> @llvm.log2.v4f16(<4 x half> undef) call <8 x half> @llvm.log2.v8f16(<8 x half> undef) call <16 x half> @llvm.log2.v16f16(<16 x half> undef) - call @llvm.log2.nvx1f16( undef) - call @llvm.log2.nvx2f16( undef) - call @llvm.log2.nvx4f16( undef) - call @llvm.log2.nvx8f16( undef) + call @llvm.log2.nxv1f16( undef) + call @llvm.log2.nxv2f16( undef) + call @llvm.log2.nxv4f16( undef) + call @llvm.log2.nxv8f16( undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/fround.ll b/llvm/test/Analysis/CostModel/RISCV/fround.ll index c6826760a45be..a0818d487d151 100644 --- 
a/llvm/test/Analysis/CostModel/RISCV/fround.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fround.ll @@ -40,30 +40,30 @@ define void @floor() { call <4 x bfloat> @llvm.floor.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.floor.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.floor.v16bf16(<16 x bfloat> undef) - call @llvm.floor.nvx1bf16( undef) - call @llvm.floor.nvx2bf16( undef) - call @llvm.floor.nvx4bf16( undef) - call @llvm.floor.nvx8bf16( undef) - call @llvm.floor.nvx16bf16( undef) + call @llvm.floor.nxv1bf16( undef) + call @llvm.floor.nxv2bf16( undef) + call @llvm.floor.nxv4bf16( undef) + call @llvm.floor.nxv8bf16( undef) + call @llvm.floor.nxv16bf16( undef) call float @llvm.floor.f32(float undef) call <2 x float> @llvm.floor.v2f32(<2 x float> undef) call <4 x float> @llvm.floor.v4f32(<4 x float> undef) call <8 x float> @llvm.floor.v8f32(<8 x float> undef) call <16 x float> @llvm.floor.v16f32(<16 x float> undef) - call @llvm.floor.nvx1f32( undef) - call @llvm.floor.nvx2f32( undef) - call @llvm.floor.nvx4f32( undef) - call @llvm.floor.nvx8f32( undef) - call @llvm.floor.nvx16f32( undef) + call @llvm.floor.nxv1f32( undef) + call @llvm.floor.nxv2f32( undef) + call @llvm.floor.nxv4f32( undef) + call @llvm.floor.nxv8f32( undef) + call @llvm.floor.nxv16f32( undef) call double @llvm.floor.f64(double undef) call <2 x double> @llvm.floor.v2f64(<2 x double> undef) call <4 x double> @llvm.floor.v4f64(<4 x double> undef) call <8 x double> @llvm.floor.v8f64(<8 x double> undef) call <16 x double> @llvm.floor.v16f64(<16 x double> undef) - call @llvm.floor.nvx1f64( undef) - call @llvm.floor.nvx2f64( undef) - call @llvm.floor.nvx4f64( undef) - call @llvm.floor.nvx8f64( undef) + call @llvm.floor.nxv1f64( undef) + call @llvm.floor.nxv2f64( undef) + call @llvm.floor.nxv4f64( undef) + call @llvm.floor.nxv8f64( undef) ret void } @@ -99,11 +99,11 @@ define void @floor_fp16() { call <4 x half> @llvm.floor.v4f16(<4 x half> undef) call <8 x half> @llvm.floor.v8f16(<8 x half> undef) 
call <16 x half> @llvm.floor.v16f16(<16 x half> undef) - call @llvm.floor.nvx1f16( undef) - call @llvm.floor.nvx2f16( undef) - call @llvm.floor.nvx4f16( undef) - call @llvm.floor.nvx8f16( undef) - call @llvm.floor.nvx16f16( undef) + call @llvm.floor.nxv1f16( undef) + call @llvm.floor.nxv2f16( undef) + call @llvm.floor.nxv4f16( undef) + call @llvm.floor.nxv8f16( undef) + call @llvm.floor.nxv16f16( undef) ret void } @@ -145,30 +145,30 @@ define void @ceil() { call <4 x bfloat> @llvm.ceil.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.ceil.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.ceil.v16bf16(<16 x bfloat> undef) - call @llvm.ceil.nvx1bf16( undef) - call @llvm.ceil.nvx2bf16( undef) - call @llvm.ceil.nvx4bf16( undef) - call @llvm.ceil.nvx8bf16( undef) - call @llvm.ceil.nvx16bf16( undef) + call @llvm.ceil.nxv1bf16( undef) + call @llvm.ceil.nxv2bf16( undef) + call @llvm.ceil.nxv4bf16( undef) + call @llvm.ceil.nxv8bf16( undef) + call @llvm.ceil.nxv16bf16( undef) call float @llvm.ceil.f32(float undef) call <2 x float> @llvm.ceil.v2f32(<2 x float> undef) call <4 x float> @llvm.ceil.v4f32(<4 x float> undef) call <8 x float> @llvm.ceil.v8f32(<8 x float> undef) call <16 x float> @llvm.ceil.v16f32(<16 x float> undef) - call @llvm.ceil.nvx1f32( undef) - call @llvm.ceil.nvx2f32( undef) - call @llvm.ceil.nvx4f32( undef) - call @llvm.ceil.nvx8f32( undef) - call @llvm.ceil.nvx16f32( undef) + call @llvm.ceil.nxv1f32( undef) + call @llvm.ceil.nxv2f32( undef) + call @llvm.ceil.nxv4f32( undef) + call @llvm.ceil.nxv8f32( undef) + call @llvm.ceil.nxv16f32( undef) call double @llvm.ceil.f64(double undef) call <2 x double> @llvm.ceil.v2f64(<2 x double> undef) call <4 x double> @llvm.ceil.v4f64(<4 x double> undef) call <8 x double> @llvm.ceil.v8f64(<8 x double> undef) call <16 x double> @llvm.ceil.v16f64(<16 x double> undef) - call @llvm.ceil.nvx1f64( undef) - call @llvm.ceil.nvx2f64( undef) - call @llvm.ceil.nvx4f64( undef) - call @llvm.ceil.nvx8f64( undef) + call 
@llvm.ceil.nxv1f64( undef) + call @llvm.ceil.nxv2f64( undef) + call @llvm.ceil.nxv4f64( undef) + call @llvm.ceil.nxv8f64( undef) ret void } @@ -204,11 +204,11 @@ define void @ceil_fp16() { call <4 x half> @llvm.ceil.v4f16(<4 x half> undef) call <8 x half> @llvm.ceil.v8f16(<8 x half> undef) call <16 x half> @llvm.ceil.v16f16(<16 x half> undef) - call @llvm.ceil.nvx1f16( undef) - call @llvm.ceil.nvx2f16( undef) - call @llvm.ceil.nvx4f16( undef) - call @llvm.ceil.nvx8f16( undef) - call @llvm.ceil.nvx16f16( undef) + call @llvm.ceil.nxv1f16( undef) + call @llvm.ceil.nxv2f16( undef) + call @llvm.ceil.nxv4f16( undef) + call @llvm.ceil.nxv8f16( undef) + call @llvm.ceil.nxv16f16( undef) ret void } @@ -250,30 +250,30 @@ define void @trunc() { call <4 x bfloat> @llvm.trunc.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.trunc.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.trunc.v16bf16(<16 x bfloat> undef) - call @llvm.trunc.nvx1bf16( undef) - call @llvm.trunc.nvx2bf16( undef) - call @llvm.trunc.nvx4bf16( undef) - call @llvm.trunc.nvx8bf16( undef) - call @llvm.trunc.nvx16bf16( undef) + call @llvm.trunc.nxv1bf16( undef) + call @llvm.trunc.nxv2bf16( undef) + call @llvm.trunc.nxv4bf16( undef) + call @llvm.trunc.nxv8bf16( undef) + call @llvm.trunc.nxv16bf16( undef) call float @llvm.trunc.f32(float undef) call <2 x float> @llvm.trunc.v2f32(<2 x float> undef) call <4 x float> @llvm.trunc.v4f32(<4 x float> undef) call <8 x float> @llvm.trunc.v8f32(<8 x float> undef) call <16 x float> @llvm.trunc.v16f32(<16 x float> undef) - call @llvm.trunc.nvx1f32( undef) - call @llvm.trunc.nvx2f32( undef) - call @llvm.trunc.nvx4f32( undef) - call @llvm.trunc.nvx8f32( undef) - call @llvm.trunc.nvx16f32( undef) + call @llvm.trunc.nxv1f32( undef) + call @llvm.trunc.nxv2f32( undef) + call @llvm.trunc.nxv4f32( undef) + call @llvm.trunc.nxv8f32( undef) + call @llvm.trunc.nxv16f32( undef) call double @llvm.trunc.f64(double undef) call <2 x double> @llvm.trunc.v2f64(<2 x double> undef) call <4 x 
double> @llvm.trunc.v4f64(<4 x double> undef) call <8 x double> @llvm.trunc.v8f64(<8 x double> undef) call <16 x double> @llvm.trunc.v16f64(<16 x double> undef) - call @llvm.trunc.nvx1f64( undef) - call @llvm.trunc.nvx2f64( undef) - call @llvm.trunc.nvx4f64( undef) - call @llvm.trunc.nvx8f64( undef) + call @llvm.trunc.nxv1f64( undef) + call @llvm.trunc.nxv2f64( undef) + call @llvm.trunc.nxv4f64( undef) + call @llvm.trunc.nxv8f64( undef) ret void } @@ -309,11 +309,11 @@ define void @trunc_fp16() { call <4 x half> @llvm.trunc.v4f16(<4 x half> undef) call <8 x half> @llvm.trunc.v8f16(<8 x half> undef) call <16 x half> @llvm.trunc.v16f16(<16 x half> undef) - call @llvm.trunc.nvx1f16( undef) - call @llvm.trunc.nvx2f16( undef) - call @llvm.trunc.nvx4f16( undef) - call @llvm.trunc.nvx8f16( undef) - call @llvm.trunc.nvx16f16( undef) + call @llvm.trunc.nxv1f16( undef) + call @llvm.trunc.nxv2f16( undef) + call @llvm.trunc.nxv4f16( undef) + call @llvm.trunc.nxv8f16( undef) + call @llvm.trunc.nxv16f16( undef) ret void } @@ -355,30 +355,30 @@ define void @rint() { call <4 x bfloat> @llvm.rint.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.rint.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.rint.v16bf16(<16 x bfloat> undef) - call @llvm.rint.nvx1bf16( undef) - call @llvm.rint.nvx2bf16( undef) - call @llvm.rint.nvx4bf16( undef) - call @llvm.rint.nvx8bf16( undef) - call @llvm.rint.nvx16bf16( undef) + call @llvm.rint.nxv1bf16( undef) + call @llvm.rint.nxv2bf16( undef) + call @llvm.rint.nxv4bf16( undef) + call @llvm.rint.nxv8bf16( undef) + call @llvm.rint.nxv16bf16( undef) call float @llvm.rint.f32(float undef) call <2 x float> @llvm.rint.v2f32(<2 x float> undef) call <4 x float> @llvm.rint.v4f32(<4 x float> undef) call <8 x float> @llvm.rint.v8f32(<8 x float> undef) call <16 x float> @llvm.rint.v16f32(<16 x float> undef) - call @llvm.rint.nvx1f32( undef) - call @llvm.rint.nvx2f32( undef) - call @llvm.rint.nvx4f32( undef) - call @llvm.rint.nvx8f32( undef) - call 
@llvm.rint.nvx16f32( undef) + call @llvm.rint.nxv1f32( undef) + call @llvm.rint.nxv2f32( undef) + call @llvm.rint.nxv4f32( undef) + call @llvm.rint.nxv8f32( undef) + call @llvm.rint.nxv16f32( undef) call double @llvm.rint.f64(double undef) call <2 x double> @llvm.rint.v2f64(<2 x double> undef) call <4 x double> @llvm.rint.v4f64(<4 x double> undef) call <8 x double> @llvm.rint.v8f64(<8 x double> undef) call <16 x double> @llvm.rint.v16f64(<16 x double> undef) - call @llvm.rint.nvx1f64( undef) - call @llvm.rint.nvx2f64( undef) - call @llvm.rint.nvx4f64( undef) - call @llvm.rint.nvx8f64( undef) + call @llvm.rint.nxv1f64( undef) + call @llvm.rint.nxv2f64( undef) + call @llvm.rint.nxv4f64( undef) + call @llvm.rint.nxv8f64( undef) ret void } @@ -414,26 +414,26 @@ define void @rint_fp16() { call <4 x half> @llvm.rint.v4f16(<4 x half> undef) call <8 x half> @llvm.rint.v8f16(<8 x half> undef) call <16 x half> @llvm.rint.v16f16(<16 x half> undef) - call @llvm.rint.nvx1f16( undef) - call @llvm.rint.nvx2f16( undef) - call @llvm.rint.nvx4f16( undef) - call @llvm.rint.nvx8f16( undef) - call @llvm.rint.nvx16f16( undef) + call @llvm.rint.nxv1f16( undef) + call @llvm.rint.nxv2f16( undef) + call @llvm.rint.nxv4f16( undef) + call @llvm.rint.nxv8f16( undef) + call @llvm.rint.nxv16f16( undef) ret void } define void @lrint() { ; CHECK-LABEL: 'lrint' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call i64 @llvm.lrint.i64.bf16(bfloat undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i64> @llvm.lrint.v2i64.v2bf16(<2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i64> @llvm.lrint.v4i64.v4bf16(<4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i64> @llvm.lrint.v8i64.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x i64> 
@llvm.lrint.v16i64.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.lrint.nxv1i64.nxv1bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.lrint.nxv2i64.nxv2bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.lrint.nxv4i64.nxv4bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.lrint.nxv8i64.nxv8bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.lrint.nxv16i64.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <2 x i64> @llvm.lrint.v2i64.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <4 x i64> @llvm.lrint.v4i64.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <8 x i64> @llvm.lrint.v8i64.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %5 = call <16 x i64> @llvm.lrint.v16i64.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.lrint.nxv1i64.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.lrint.nxv2i64.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.lrint.nxv4i64.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.lrint.nxv8i64.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.lrint.nxv16i64.nxv16bf16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call i64 @llvm.lrint.i64.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = 
call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> undef) @@ -460,45 +460,45 @@ define void @lrint() { call <4 x i64> @llvm.lrint.v4i64.v4bf16(<4 x bfloat> undef) call <8 x i64> @llvm.lrint.v8i64.v8bf16(<8 x bfloat> undef) call <16 x i64> @llvm.lrint.v16i64.v16bf16(<16 x bfloat> undef) - call @llvm.lrint.nvx1i64.nvx1bf16( undef) - call @llvm.lrint.nvx2i64.nvx2bf16( undef) - call @llvm.lrint.nvx4i64.nvx4bf16( undef) - call @llvm.lrint.nvx8i64.nvx8bf16( undef) - call @llvm.lrint.nvx16i64.nvx16bf16( undef) + call @llvm.lrint.nxv1i64.nxv1bf16( undef) + call @llvm.lrint.nxv2i64.nxv2bf16( undef) + call @llvm.lrint.nxv4i64.nxv4bf16( undef) + call @llvm.lrint.nxv8i64.nxv8bf16( undef) + call @llvm.lrint.nxv16i64.nxv16bf16( undef) call i64 @llvm.lrint.i64.f32(float undef) call <2 x i64> @llvm.lrint.v2i64.v2f32(<2 x float> undef) call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> undef) call <8 x i64> @llvm.lrint.v8i64.v8f32(<8 x float> undef) call <16 x i64> @llvm.lrint.v16i64.v16f32(<16 x float> undef) - call @llvm.lrint.nvx1i64.nvx1f32( undef) - call @llvm.lrint.nvx2i64.nvx2f32( undef) - call @llvm.lrint.nvx4i64.nvx4f32( undef) - call @llvm.lrint.nvx8i64.nvx8f32( undef) - call @llvm.lrint.nvx16i64.nvx16f32( undef) + call @llvm.lrint.nxv1i64.nxv1f32( undef) + call @llvm.lrint.nxv2i64.nxv2f32( undef) + call @llvm.lrint.nxv4i64.nxv4f32( undef) + call @llvm.lrint.nxv8i64.nxv8f32( undef) + call @llvm.lrint.nxv16i64.nxv16f32( undef) call i64 @llvm.lrint.i64.f64(double undef) call <2 x i64> @llvm.lrint.v2i64.v2f64(<2 x double> undef) call <4 x i64> @llvm.lrint.v4i64.v4f64(<4 x double> undef) call <8 x i64> @llvm.lrint.v8i64.v8f64(<8 x double> undef) call <16 x i64> @llvm.lrint.v16i64.v16f64(<16 x double> undef) - call @llvm.lrint.nvx1i64.nvx1f64( undef) - call @llvm.lrint.nvx2i64.nvx2f64( undef) - call @llvm.lrint.nvx4i64.nvx4f64( undef) - call @llvm.lrint.nvx8i64.nvx8f64( undef) + call @llvm.lrint.nxv1i64.nxv1f64( undef) + call @llvm.lrint.nxv2i64.nxv2f64( undef) + call 
@llvm.lrint.nxv4i64.nxv4f64( undef) + call @llvm.lrint.nxv8i64.nxv8f64( undef) ret void } define void @lrint_fp16() { ; CHECK-LABEL: 'lrint_fp16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call i64 @llvm.lrint.i64.f16(half undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i64> @llvm.lrint.v2i64.v2f16(<2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i64> @llvm.lrint.v4i64.v4f16(<4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i64> @llvm.lrint.v8i64.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x i64> @llvm.lrint.v16i64.v16f16(<16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.lrint.nxv1i64.nxv1f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.lrint.nxv2i64.nxv2f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.lrint.nxv4i64.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.lrint.nxv8i64.nxv8f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.lrint.nxv16i64.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <2 x i64> @llvm.lrint.v2i64.v2f16(<2 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <4 x i64> @llvm.lrint.v4i64.v4f16(<4 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <8 x i64> @llvm.lrint.v8i64.v8f16(<8 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %5 = call <16 x i64> @llvm.lrint.v16i64.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.lrint.nxv1i64.nxv1f16( undef) +; 
CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.lrint.nxv2i64.nxv2f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.lrint.nxv4i64.nxv4f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.lrint.nxv8i64.nxv8f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.lrint.nxv16i64.nxv16f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i64 @llvm.lrint.f16(half undef) @@ -506,26 +506,26 @@ define void @lrint_fp16() { call <4 x i64> @llvm.lrint.v4f16(<4 x half> undef) call <8 x i64> @llvm.lrint.v8f16(<8 x half> undef) call <16 x i64> @llvm.lrint.v16f16(<16 x half> undef) - call @llvm.lrint.nvx1f16( undef) - call @llvm.lrint.nvx2f16( undef) - call @llvm.lrint.nvx4f16( undef) - call @llvm.lrint.nvx8f16( undef) - call @llvm.lrint.nvx16f16( undef) + call @llvm.lrint.nxv1f16( undef) + call @llvm.lrint.nxv2f16( undef) + call @llvm.lrint.nxv4f16( undef) + call @llvm.lrint.nxv8f16( undef) + call @llvm.lrint.nxv16f16( undef) ret void } define void @llrint() { ; CHECK-LABEL: 'llrint' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call i64 @llvm.llrint.i64.bf16(bfloat undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i64> @llvm.llrint.v2i64.v2bf16(<2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i64> @llvm.llrint.v4i64.v4bf16(<4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i64> @llvm.llrint.v8i64.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x i64> @llvm.llrint.v16i64.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.llrint.nxv1i64.nxv1bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %7 = call @llvm.llrint.nxv2i64.nxv2bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.llrint.nxv4i64.nxv4bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.llrint.nxv8i64.nxv8bf16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.llrint.nxv16i64.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <2 x i64> @llvm.llrint.v2i64.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <4 x i64> @llvm.llrint.v4i64.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <8 x i64> @llvm.llrint.v8i64.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %5 = call <16 x i64> @llvm.llrint.v16i64.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.llrint.nxv1i64.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.llrint.nxv2i64.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.llrint.nxv4i64.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.llrint.nxv8i64.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.llrint.nxv16i64.nxv16bf16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call i64 @llvm.llrint.i64.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef) @@ -552,45 +552,45 @@ define void @llrint() { call <4 x i64> @llvm.llrint.v4i64.v4bf16(<4 x bfloat> undef) call <8 x i64> 
@llvm.llrint.v8i64.v8bf16(<8 x bfloat> undef) call <16 x i64> @llvm.llrint.v16i64.v16bf16(<16 x bfloat> undef) - call @llvm.llrint.nvx1i64.nvx1bf16( undef) - call @llvm.llrint.nvx2i64.nvx2bf16( undef) - call @llvm.llrint.nvx4i64.nvx4bf16( undef) - call @llvm.llrint.nvx8i64.nvx8bf16( undef) - call @llvm.llrint.nvx16i64.nvx16bf16( undef) + call @llvm.llrint.nxv1i64.nxv1bf16( undef) + call @llvm.llrint.nxv2i64.nxv2bf16( undef) + call @llvm.llrint.nxv4i64.nxv4bf16( undef) + call @llvm.llrint.nxv8i64.nxv8bf16( undef) + call @llvm.llrint.nxv16i64.nxv16bf16( undef) call i64 @llvm.llrint.i64.f32(float undef) call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> undef) call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> undef) call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> undef) call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> undef) - call @llvm.llrint.nvx1i64.nvx1f32( undef) - call @llvm.llrint.nvx2i64.nvx2f32( undef) - call @llvm.llrint.nvx4i64.nvx4f32( undef) - call @llvm.llrint.nvx8i64.nvx8f32( undef) - call @llvm.llrint.nvx16i64.nvx16f32( undef) + call @llvm.llrint.nxv1i64.nxv1f32( undef) + call @llvm.llrint.nxv2i64.nxv2f32( undef) + call @llvm.llrint.nxv4i64.nxv4f32( undef) + call @llvm.llrint.nxv8i64.nxv8f32( undef) + call @llvm.llrint.nxv16i64.nxv16f32( undef) call i64 @llvm.llrint.i64.f64(double undef) call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> undef) call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> undef) call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> undef) call <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double> undef) - call @llvm.llrint.nvx1i64.nvx1f64( undef) - call @llvm.llrint.nvx2i64.nvx2f64( undef) - call @llvm.llrint.nvx4i64.nvx4f64( undef) - call @llvm.llrint.nvx8i64.nvx8f64( undef) + call @llvm.llrint.nxv1i64.nxv1f64( undef) + call @llvm.llrint.nxv2i64.nxv2f64( undef) + call @llvm.llrint.nxv4i64.nxv4f64( undef) + call @llvm.llrint.nxv8i64.nxv8f64( undef) ret void } define void @llrint_fp16() { ; CHECK-LABEL: 
'llrint_fp16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call i64 @llvm.llrint.i64.f16(half undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.llrint.nxv1i64.nxv1f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.llrint.nxv2i64.nxv2f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.llrint.nxv4i64.nxv4f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.llrint.nxv8i64.nxv8f16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call @llvm.llrint.nxv16i64.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %5 = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.llrint.nxv1i64.nxv1f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.llrint.nxv2i64.nxv2f16( undef) +; CHECK-NEXT: Cost 
Model: Invalid cost for instruction: %8 = call @llvm.llrint.nxv4i64.nxv4f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.llrint.nxv8i64.nxv8f16( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.llrint.nxv16i64.nxv16f16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call i64 @llvm.llrint.f16(half undef) @@ -598,11 +598,11 @@ define void @llrint_fp16() { call <4 x i64> @llvm.llrint.v4f16(<4 x half> undef) call <8 x i64> @llvm.llrint.v8f16(<8 x half> undef) call <16 x i64> @llvm.llrint.v16f16(<16 x half> undef) - call @llvm.llrint.nvx1f16( undef) - call @llvm.llrint.nvx2f16( undef) - call @llvm.llrint.nvx4f16( undef) - call @llvm.llrint.nvx8f16( undef) - call @llvm.llrint.nvx16f16( undef) + call @llvm.llrint.nxv1f16( undef) + call @llvm.llrint.nxv2f16( undef) + call @llvm.llrint.nxv4f16( undef) + call @llvm.llrint.nxv8f16( undef) + call @llvm.llrint.nxv16f16( undef) ret void } @@ -644,30 +644,30 @@ define void @nearbyint() { call <4 x bfloat> @llvm.nearbyint.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.nearbyint.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.nearbyint.v16bf16(<16 x bfloat> undef) - call @llvm.nearbyint.nvx1bf16( undef) - call @llvm.nearbyint.nvx2bf16( undef) - call @llvm.nearbyint.nvx4bf16( undef) - call @llvm.nearbyint.nvx8bf16( undef) - call @llvm.nearbyint.nvx16bf16( undef) + call @llvm.nearbyint.nxv1bf16( undef) + call @llvm.nearbyint.nxv2bf16( undef) + call @llvm.nearbyint.nxv4bf16( undef) + call @llvm.nearbyint.nxv8bf16( undef) + call @llvm.nearbyint.nxv16bf16( undef) call float @llvm.nearbyint.f32(float undef) call <2 x float> @llvm.nearbyint.v2f32(<2 x float> undef) call <4 x float> @llvm.nearbyint.v4f32(<4 x float> undef) call <8 x float> @llvm.nearbyint.v8f32(<8 x float> undef) call <16 x float> @llvm.nearbyint.v16f32(<16 x float> undef) - call @llvm.nearbyint.nvx1f32( undef) - call @llvm.nearbyint.nvx2f32( undef) - call 
@llvm.nearbyint.nvx4f32( undef) - call @llvm.nearbyint.nvx8f32( undef) - call @llvm.nearbyint.nvx16f32( undef) + call @llvm.nearbyint.nxv1f32( undef) + call @llvm.nearbyint.nxv2f32( undef) + call @llvm.nearbyint.nxv4f32( undef) + call @llvm.nearbyint.nxv8f32( undef) + call @llvm.nearbyint.nxv16f32( undef) call double @llvm.nearbyint.f64(double undef) call <2 x double> @llvm.nearbyint.v2f64(<2 x double> undef) call <4 x double> @llvm.nearbyint.v4f64(<4 x double> undef) call <8 x double> @llvm.nearbyint.v8f64(<8 x double> undef) call <16 x double> @llvm.nearbyint.v16f64(<16 x double> undef) - call @llvm.nearbyint.nvx1f64( undef) - call @llvm.nearbyint.nvx2f64( undef) - call @llvm.nearbyint.nvx4f64( undef) - call @llvm.nearbyint.nvx8f64( undef) + call @llvm.nearbyint.nxv1f64( undef) + call @llvm.nearbyint.nxv2f64( undef) + call @llvm.nearbyint.nxv4f64( undef) + call @llvm.nearbyint.nxv8f64( undef) ret void } @@ -703,11 +703,11 @@ define void @nearbyint_fp16() { call <4 x half> @llvm.nearbyint.v4f16(<4 x half> undef) call <8 x half> @llvm.nearbyint.v8f16(<8 x half> undef) call <16 x half> @llvm.nearbyint.v16f16(<16 x half> undef) - call @llvm.nearbyint.nvx1f16( undef) - call @llvm.nearbyint.nvx2f16( undef) - call @llvm.nearbyint.nvx4f16( undef) - call @llvm.nearbyint.nvx8f16( undef) - call @llvm.nearbyint.nvx16f16( undef) + call @llvm.nearbyint.nxv1f16( undef) + call @llvm.nearbyint.nxv2f16( undef) + call @llvm.nearbyint.nxv4f16( undef) + call @llvm.nearbyint.nxv8f16( undef) + call @llvm.nearbyint.nxv16f16( undef) ret void } @@ -749,30 +749,30 @@ define void @round() { call <4 x bfloat> @llvm.round.v4bf16(<4 x bfloat> undef) call <8 x bfloat> @llvm.round.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.round.v16bf16(<16 x bfloat> undef) - call @llvm.round.nvx1bf16( undef) - call @llvm.round.nvx2bf16( undef) - call @llvm.round.nvx4bf16( undef) - call @llvm.round.nvx8bf16( undef) - call @llvm.round.nvx16bf16( undef) + call @llvm.round.nxv1bf16( undef) + call 
@llvm.round.nxv2bf16( undef) + call @llvm.round.nxv4bf16( undef) + call @llvm.round.nxv8bf16( undef) + call @llvm.round.nxv16bf16( undef) call float @llvm.round.f32(float undef) call <2 x float> @llvm.round.v2f32(<2 x float> undef) call <4 x float> @llvm.round.v4f32(<4 x float> undef) call <8 x float> @llvm.round.v8f32(<8 x float> undef) call <16 x float> @llvm.round.v16f32(<16 x float> undef) - call @llvm.round.nvx1f32( undef) - call @llvm.round.nvx2f32( undef) - call @llvm.round.nvx4f32( undef) - call @llvm.round.nvx8f32( undef) - call @llvm.round.nvx16f32( undef) + call @llvm.round.nxv1f32( undef) + call @llvm.round.nxv2f32( undef) + call @llvm.round.nxv4f32( undef) + call @llvm.round.nxv8f32( undef) + call @llvm.round.nxv16f32( undef) call double @llvm.round.f64(double undef) call <2 x double> @llvm.round.v2f64(<2 x double> undef) call <4 x double> @llvm.round.v4f64(<4 x double> undef) call <8 x double> @llvm.round.v8f64(<8 x double> undef) call <16 x double> @llvm.round.v16f64(<16 x double> undef) - call @llvm.round.nvx1f64( undef) - call @llvm.round.nvx2f64( undef) - call @llvm.round.nvx4f64( undef) - call @llvm.round.nvx8f64( undef) + call @llvm.round.nxv1f64( undef) + call @llvm.round.nxv2f64( undef) + call @llvm.round.nxv4f64( undef) + call @llvm.round.nxv8f64( undef) ret void } @@ -808,11 +808,11 @@ define void @round_fp16() { call <4 x half> @llvm.round.v4f16(<4 x half> undef) call <8 x half> @llvm.round.v8f16(<8 x half> undef) call <16 x half> @llvm.round.v16f16(<16 x half> undef) - call @llvm.round.nvx1f16( undef) - call @llvm.round.nvx2f16( undef) - call @llvm.round.nvx4f16( undef) - call @llvm.round.nvx8f16( undef) - call @llvm.round.nvx16f16( undef) + call @llvm.round.nxv1f16( undef) + call @llvm.round.nxv2f16( undef) + call @llvm.round.nxv4f16( undef) + call @llvm.round.nxv8f16( undef) + call @llvm.round.nxv16f16( undef) ret void } @@ -854,30 +854,30 @@ define void @roundeven() { call <4 x bfloat> @llvm.roundeven.v4bf16(<4 x bfloat> undef) call <8 
x bfloat> @llvm.roundeven.v8bf16(<8 x bfloat> undef) call <16 x bfloat> @llvm.roundeven.v16bf16(<16 x bfloat> undef) - call @llvm.roundeven.nvx1bf16( undef) - call @llvm.roundeven.nvx2bf16( undef) - call @llvm.roundeven.nvx4bf16( undef) - call @llvm.roundeven.nvx8bf16( undef) - call @llvm.roundeven.nvx16bf16( undef) + call @llvm.roundeven.nxv1bf16( undef) + call @llvm.roundeven.nxv2bf16( undef) + call @llvm.roundeven.nxv4bf16( undef) + call @llvm.roundeven.nxv8bf16( undef) + call @llvm.roundeven.nxv16bf16( undef) call float @llvm.roundeven.f32(float undef) call <2 x float> @llvm.roundeven.v2f32(<2 x float> undef) call <4 x float> @llvm.roundeven.v4f32(<4 x float> undef) call <8 x float> @llvm.roundeven.v8f32(<8 x float> undef) call <16 x float> @llvm.roundeven.v16f32(<16 x float> undef) - call @llvm.roundeven.nvx1f32( undef) - call @llvm.roundeven.nvx2f32( undef) - call @llvm.roundeven.nvx4f32( undef) - call @llvm.roundeven.nvx8f32( undef) - call @llvm.roundeven.nvx16f32( undef) + call @llvm.roundeven.nxv1f32( undef) + call @llvm.roundeven.nxv2f32( undef) + call @llvm.roundeven.nxv4f32( undef) + call @llvm.roundeven.nxv8f32( undef) + call @llvm.roundeven.nxv16f32( undef) call double @llvm.roundeven.f64(double undef) call <2 x double> @llvm.roundeven.v2f64(<2 x double> undef) call <4 x double> @llvm.roundeven.v4f64(<4 x double> undef) call <8 x double> @llvm.roundeven.v8f64(<8 x double> undef) call <16 x double> @llvm.roundeven.v16f64(<16 x double> undef) - call @llvm.roundeven.nvx1f64( undef) - call @llvm.roundeven.nvx2f64( undef) - call @llvm.roundeven.nvx4f64( undef) - call @llvm.roundeven.nvx8f64( undef) + call @llvm.roundeven.nxv1f64( undef) + call @llvm.roundeven.nxv2f64( undef) + call @llvm.roundeven.nxv4f64( undef) + call @llvm.roundeven.nxv8f64( undef) ret void } @@ -913,11 +913,11 @@ define void @roundeven_fp16() { call <4 x half> @llvm.roundeven.v4f16(<4 x half> undef) call <8 x half> @llvm.roundeven.v8f16(<8 x half> undef) call <16 x half> 
@llvm.roundeven.v16f16(<16 x half> undef) - call @llvm.roundeven.nvx1f16( undef) - call @llvm.roundeven.nvx2f16( undef) - call @llvm.roundeven.nvx4f16( undef) - call @llvm.roundeven.nvx8f16( undef) - call @llvm.roundeven.nvx16f16( undef) + call @llvm.roundeven.nxv1f16( undef) + call @llvm.roundeven.nxv2f16( undef) + call @llvm.roundeven.nxv4f16( undef) + call @llvm.roundeven.nxv8f16( undef) + call @llvm.roundeven.nxv16f16( undef) ret void } @@ -955,28 +955,28 @@ define void @vp_ceil() { call <4 x bfloat> @llvm.vp.ceil.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.ceil.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.ceil.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ceil.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.ceil.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.ceil.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.ceil.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.ceil.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ceil.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv1f32( undef, undef, i32 undef) + call 
@llvm.vp.ceil.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.ceil.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.ceil.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.ceil.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.ceil.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ceil.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv8f64( undef, undef, i32 undef) ret void } @@ -1009,11 +1009,11 @@ define void @vp_ceil_f16() { call <4 x half> @llvm.vp.ceil.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.ceil.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.ceil.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ceil.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.ceil.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.ceil.nxv16f16( undef, undef, i32 undef) ret void } @@ -1051,28 +1051,28 @@ define void @vp_floor() { call <4 x bfloat> @llvm.vp.floor.v4bf16(<4 x bfloat> undef, <4 x i1> undef, 
i32 undef) call <8 x bfloat> @llvm.vp.floor.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.floor.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.floor.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.floor.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.floor.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.floor.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.floor.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.floor.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.floor.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.floor.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.floor.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.floor.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.floor.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.floor.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.floor.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.floor.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.floor.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.floor.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.floor.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.floor.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.floor.v16f64(<16 x double> undef, <16 x i1> undef, 
i32 undef) - call @llvm.vp.floor.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.floor.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.floor.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.floor.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.floor.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.floor.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.floor.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.floor.nxv8f64( undef, undef, i32 undef) ret void } @@ -1105,11 +1105,11 @@ define void @vp_floor_f16() { call <4 x half> @llvm.vp.floor.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.floor.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.floor.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.floor.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.floor.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.floor.nxv16f16( undef, undef, i32 undef) ret void } @@ -1147,28 +1147,28 @@ define void @vp_round() { call <4 x bfloat> @llvm.vp.round.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.round.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.round.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.round.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.round.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.round.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.round.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.round.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.round.nxv1bf16( undef, undef, i32 
undef) + call @llvm.vp.round.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.round.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.round.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.round.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.round.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.round.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.round.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.round.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.round.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.round.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.round.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.round.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.round.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.round.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.round.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.round.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.round.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.round.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.round.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.round.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.round.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.round.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.round.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.round.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.round.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.round.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.round.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.round.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.round.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.round.nxv8f64( undef, undef, i32 undef) ret void } @@ -1201,11 +1201,11 @@ define void @vp_round_f16() { 
call <4 x half> @llvm.vp.round.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.round.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.round.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.round.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.round.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.round.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.round.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.round.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.round.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.round.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.round.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.round.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.round.nxv16f16( undef, undef, i32 undef) ret void } @@ -1243,28 +1243,28 @@ define void @vp_roundeven() { call <4 x bfloat> @llvm.vp.roundeven.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.roundeven.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.roundeven.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundeven.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.roundeven.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.roundeven.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.roundeven.v8f32(<8 x float> 
undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.roundeven.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundeven.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.roundeven.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.roundeven.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.roundeven.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.roundeven.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundeven.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv8f64( undef, undef, i32 undef) ret void } @@ -1297,11 +1297,11 @@ define void @vp_roundeven_f16() { call <4 x half> @llvm.vp.roundeven.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.roundeven.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.roundeven.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundeven.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx2f16( 
undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.roundeven.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.roundeven.nxv16f16( undef, undef, i32 undef) ret void } @@ -1339,28 +1339,28 @@ define void @vp_roundtozero() { call <4 x bfloat> @llvm.vp.roundtozero.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.roundtozero.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.roundtozero.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundtozero.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.roundtozero.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.roundtozero.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.roundtozero.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.roundtozero.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundtozero.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx2f32( undef, undef, i32 undef) - call 
@llvm.vp.roundtozero.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundtozero.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv8f64( undef, undef, i32 undef) ret void } @@ -1393,11 +1393,11 @@ define void @vp_roundtozero_f16() { call <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.roundtozero.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.roundtozero.nvx16f16( undef, undef, 
i32 undef) + call @llvm.vp.roundtozero.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.roundtozero.nxv16f16( undef, undef, i32 undef) ret void } @@ -1435,28 +1435,28 @@ define void @vp_rint() { call <4 x bfloat> @llvm.vp.rint.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.rint.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) call <16 x bfloat> @llvm.vp.rint.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.rint.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.rint.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.rint.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.rint.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.rint.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.rint.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.rint.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.rint.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.rint.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.rint.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.rint.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.rint.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.rint.nxv4f32( undef, undef, i32 undef) + call 
@llvm.vp.rint.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.rint.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.rint.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.rint.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.rint.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.rint.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.rint.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.rint.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.rint.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.rint.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.rint.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.rint.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.rint.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.rint.nxv8f64( undef, undef, i32 undef) ret void } @@ -1489,11 +1489,11 @@ define void @vp_rint_f16() { call <4 x half> @llvm.vp.rint.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.rint.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.rint.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.rint.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.rint.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.rint.nxv16f16( undef, undef, i32 undef) ret void } @@ -1531,28 +1531,28 @@ define void @vp_nearbyint() { call <4 x bfloat> @llvm.vp.nearbyint.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef) call <8 x bfloat> @llvm.vp.nearbyint.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef) 
call <16 x bfloat> @llvm.vp.nearbyint.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.nearbyint.nvx1bf16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx2bf16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx4bf16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx8bf16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx16bf16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv1bf16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv2bf16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv4bf16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv8bf16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv16bf16( undef, undef, i32 undef) call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.nearbyint.nvx1f32( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx2f32( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx4f32( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx8f32( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx16f32( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv1f32( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv2f32( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv4f32( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv8f32( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv16f32( undef, undef, i32 undef) call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> 
undef, <16 x i1> undef, i32 undef) - call @llvm.vp.nearbyint.nvx1f64( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx2f64( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx4f64( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx8f64( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv1f64( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv2f64( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv4f64( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv8f64( undef, undef, i32 undef) ret void } @@ -1585,10 +1585,10 @@ define void @vp_nearbyint_f16() { call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef) call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef) call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.nearbyint.nvx1f16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx2f16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx4f16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx8f16( undef, undef, i32 undef) - call @llvm.vp.nearbyint.nvx16f16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv1f16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv2f16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv4f16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv8f16( undef, undef, i32 undef) + call @llvm.vp.nearbyint.nxv16f16( undef, undef, i32 undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll index b3b31d6f001ac..ea05464b08408 100644 --- a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll @@ -42,31 +42,31 @@ define void @bswap() { call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef) call <8 x i16> @llvm.bswap.v8i16(<8 x i16> undef) call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef) - call @llvm.bswap.nvx1i16( undef) - call @llvm.bswap.nvx2i16( 
undef) - call @llvm.bswap.nvx4i16( undef) - call @llvm.bswap.nvx8i16( undef) - call @llvm.bswap.nvx16i16( undef) + call @llvm.bswap.nxv1i16( undef) + call @llvm.bswap.nxv2i16( undef) + call @llvm.bswap.nxv4i16( undef) + call @llvm.bswap.nxv8i16( undef) + call @llvm.bswap.nxv16i16( undef) call i32 @llvm.bswap.i32(i32 undef) call <2 x i32> @llvm.bswap.v2i32(<2 x i32> undef) call <4 x i32> @llvm.bswap.v4i32(<4 x i32> undef) call <8 x i32> @llvm.bswap.v8i32(<8 x i32> undef) call <16 x i32> @llvm.bswap.v16i32(<16 x i32> undef) - call @llvm.bswap.nvx1i32( undef) - call @llvm.bswap.nvx2i32( undef) - call @llvm.bswap.nvx4i32( undef) - call @llvm.bswap.nvx8i32( undef) - call @llvm.bswap.nvx16i32( undef) + call @llvm.bswap.nxv1i32( undef) + call @llvm.bswap.nxv2i32( undef) + call @llvm.bswap.nxv4i32( undef) + call @llvm.bswap.nxv8i32( undef) + call @llvm.bswap.nxv16i32( undef) call i64 @llvm.bswap.i64(i64 undef) call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef) call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef) call <8 x i64> @llvm.bswap.v8i64(<8 x i64> undef) call <16 x i64> @llvm.bswap.v16i64(<16 x i64> undef) - call @llvm.bswap.nvx1i64( undef) - call @llvm.bswap.nvx2i64( undef) - call @llvm.bswap.nvx4i64( undef) - call @llvm.bswap.nvx8i64( undef) - call @llvm.bswap.nvx16i64( undef) + call @llvm.bswap.nxv1i64( undef) + call @llvm.bswap.nxv2i64( undef) + call @llvm.bswap.nxv4i64( undef) + call @llvm.bswap.nxv8i64( undef) + call @llvm.bswap.nxv16i64( undef) ret void } @@ -119,41 +119,41 @@ define void @bitreverse() { call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> undef) call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> undef) call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> undef) - call @llvm.bitreverse.nvx1i8( undef) - call @llvm.bitreverse.nvx2i8( undef) - call @llvm.bitreverse.nvx4i8( undef) - call @llvm.bitreverse.nvx8i8( undef) - call @llvm.bitreverse.nvx16i8( undef) + call @llvm.bitreverse.nxv1i8( undef) + call @llvm.bitreverse.nxv2i8( undef) + call @llvm.bitreverse.nxv4i8( 
undef) + call @llvm.bitreverse.nxv8i8( undef) + call @llvm.bitreverse.nxv16i8( undef) call i16 @llvm.bitreverse.i16(i16 undef) call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef) call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> undef) call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> undef) call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> undef) - call @llvm.bitreverse.nvx1i16( undef) - call @llvm.bitreverse.nvx2i16( undef) - call @llvm.bitreverse.nvx4i16( undef) - call @llvm.bitreverse.nvx8i16( undef) - call @llvm.bitreverse.nvx16i16( undef) + call @llvm.bitreverse.nxv1i16( undef) + call @llvm.bitreverse.nxv2i16( undef) + call @llvm.bitreverse.nxv4i16( undef) + call @llvm.bitreverse.nxv8i16( undef) + call @llvm.bitreverse.nxv16i16( undef) call i32 @llvm.bitreverse.i32(i32 undef) call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> undef) call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> undef) call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> undef) call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> undef) - call @llvm.bitreverse.nvx1i32( undef) - call @llvm.bitreverse.nvx2i32( undef) - call @llvm.bitreverse.nvx4i32( undef) - call @llvm.bitreverse.nvx8i32( undef) - call @llvm.bitreverse.nvx16i32( undef) + call @llvm.bitreverse.nxv1i32( undef) + call @llvm.bitreverse.nxv2i32( undef) + call @llvm.bitreverse.nxv4i32( undef) + call @llvm.bitreverse.nxv8i32( undef) + call @llvm.bitreverse.nxv16i32( undef) call i64 @llvm.bitreverse.i64(i64 undef) call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> undef) call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> undef) call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> undef) call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> undef) - call @llvm.bitreverse.nvx1i64( undef) - call @llvm.bitreverse.nvx2i64( undef) - call @llvm.bitreverse.nvx4i64( undef) - call @llvm.bitreverse.nvx8i64( undef) - call @llvm.bitreverse.nvx16i64( undef) + call @llvm.bitreverse.nxv1i64( undef) + call @llvm.bitreverse.nxv2i64( undef) + call @llvm.bitreverse.nxv4i64( 
undef) + call @llvm.bitreverse.nxv8i64( undef) + call @llvm.bitreverse.nxv16i64( undef) ret void } @@ -249,41 +249,41 @@ define void @ctpop() { call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> undef) call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> undef) call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> undef) - call @llvm.ctpop.nvx1i8( undef) - call @llvm.ctpop.nvx2i8( undef) - call @llvm.ctpop.nvx4i8( undef) - call @llvm.ctpop.nvx8i8( undef) - call @llvm.ctpop.nvx16i8( undef) + call @llvm.ctpop.nxv1i8( undef) + call @llvm.ctpop.nxv2i8( undef) + call @llvm.ctpop.nxv4i8( undef) + call @llvm.ctpop.nxv8i8( undef) + call @llvm.ctpop.nxv16i8( undef) call i16 @llvm.ctpop.i16(i16 undef) call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> undef) call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> undef) call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> undef) call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> undef) - call @llvm.ctpop.nvx1i16( undef) - call @llvm.ctpop.nvx2i16( undef) - call @llvm.ctpop.nvx4i16( undef) - call @llvm.ctpop.nvx8i16( undef) - call @llvm.ctpop.nvx16i16( undef) + call @llvm.ctpop.nxv1i16( undef) + call @llvm.ctpop.nxv2i16( undef) + call @llvm.ctpop.nxv4i16( undef) + call @llvm.ctpop.nxv8i16( undef) + call @llvm.ctpop.nxv16i16( undef) call i32 @llvm.ctpop.i32(i32 undef) call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> undef) call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> undef) call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> undef) call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> undef) - call @llvm.ctpop.nvx1i32( undef) - call @llvm.ctpop.nvx2i32( undef) - call @llvm.ctpop.nvx4i32( undef) - call @llvm.ctpop.nvx8i32( undef) - call @llvm.ctpop.nvx16i32( undef) + call @llvm.ctpop.nxv1i32( undef) + call @llvm.ctpop.nxv2i32( undef) + call @llvm.ctpop.nxv4i32( undef) + call @llvm.ctpop.nxv8i32( undef) + call @llvm.ctpop.nxv16i32( undef) call i64 @llvm.ctpop.i64(i64 undef) call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> undef) call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> undef) call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> undef) 
call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> undef) - call @llvm.ctpop.nvx1i64( undef) - call @llvm.ctpop.nvx2i64( undef) - call @llvm.ctpop.nvx4i64( undef) - call @llvm.ctpop.nvx8i64( undef) - call @llvm.ctpop.nvx16i64( undef) + call @llvm.ctpop.nxv1i64( undef) + call @llvm.ctpop.nxv2i64( undef) + call @llvm.ctpop.nxv4i64( undef) + call @llvm.ctpop.nxv8i64( undef) + call @llvm.ctpop.nxv16i64( undef) ret void } @@ -322,29 +322,29 @@ define void @vp_bswap() { call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.bswap.nvx1i16( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx2i16( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx4i16( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx8i16( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx16i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv1i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv2i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv4i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv8i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv16i16( undef, undef, i32 undef) call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.bswap.nvx1i32( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx2i32( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx4i32( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx8i32( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx16i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv1i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv2i32( undef, undef, i32 undef) + call 
@llvm.vp.bswap.nxv4i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv8i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv16i32( undef, undef, i32 undef) call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.bswap.nvx1i64( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx2i64( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx4i64( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx8i64( undef, undef, i32 undef) - call @llvm.vp.bswap.nvx16i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv1i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv2i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv4i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv8i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nxv16i64( undef, undef, i32 undef) ret void } @@ -392,38 +392,38 @@ define void @vp_ctpop() { call <4 x i16> @llvm.vp.ctpop.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.ctpop.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.ctpop.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ctpop.nvx1i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx2i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx4i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx8i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx16i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv1i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv2i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv4i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv8i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv16i16( undef, undef, i32 undef) call <2 x i16> @llvm.vp.ctpop.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) call 
<4 x i16> @llvm.vp.ctpop.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.ctpop.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.ctpop.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ctpop.nvx1i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx2i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx4i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx8i16( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx16i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv1i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv2i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv4i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv8i16( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv16i16( undef, undef, i32 undef) call <2 x i32> @llvm.vp.ctpop.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) call <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) call <8 x i32> @llvm.vp.ctpop.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) call <16 x i32> @llvm.vp.ctpop.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) - call @llvm.vp.ctpop.nvx1i32( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx2i32( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx4i32( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx8i32( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx16i32( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv1i32( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv2i32( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv4i32( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv8i32( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv16i32( undef, undef, i32 undef) call <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) call <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) call <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> undef, <16 x i1> undef, 
i32 undef) - call @llvm.vp.ctpop.nvx1i64( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx2i64( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx4i64( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx8i64( undef, undef, i32 undef) - call @llvm.vp.ctpop.nvx16i64( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv1i64( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv2i64( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv4i64( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv8i64( undef, undef, i32 undef) + call @llvm.vp.ctpop.nxv16i64( undef, undef, i32 undef) ret void } @@ -484,51 +484,51 @@ define void @vp_ctlz() { call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.ctlz.nvx1i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx2i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx4i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx8i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx16i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx32i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx64i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv1i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv2i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv4i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv8i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv16i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv32i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv64i8( undef, i1 false, undef, i32 undef) call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i16> 
@llvm.vp.ctlz.v8i16(<8 x i16> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.ctlz.nvx1i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx2i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx4i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx8i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx16i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx32i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv1i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv2i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv4i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv8i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv16i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv32i16( undef, i1 false, undef, i32 undef) call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.ctlz.nvx1i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx2i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx4i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx8i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx16i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx32i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv1i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv2i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv4i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv8i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv16i16( 
undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv32i16( undef, i1 false, undef, i32 undef) call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.ctlz.nvx1i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx2i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx4i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx8i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx16i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv1i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv2i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv4i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv8i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv16i32( undef, i1 false, undef, i32 undef) call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.ctlz.nvx1i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx2i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx4i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx8i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.ctlz.nvx16i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv1i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv2i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv4i64( undef, i1 false, undef, i32 undef) + 
call @llvm.vp.ctlz.nxv8i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.ctlz.nxv16i64( undef, i1 false, undef, i32 undef) ret void } @@ -589,51 +589,51 @@ define void @vp_cttz() { call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.cttz.nvx1i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx2i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx4i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx8i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx16i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx32i8( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx64i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv1i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv2i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv4i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv8i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv16i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv32i8( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv64i8( undef, i1 false, undef, i32 undef) call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.cttz.nvx1i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx2i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx4i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx8i16( undef, i1 false, undef, i32 undef) - call 
@llvm.vp.cttz.nvx16i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx32i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv1i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv2i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv4i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv8i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv16i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv32i16( undef, i1 false, undef, i32 undef) call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.cttz.nvx1i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx2i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx4i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx8i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx16i16( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx32i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv1i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv2i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv4i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv8i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv16i16( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv32i16( undef, i1 false, undef, i32 undef) call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> undef, i1 false, <16 x 
i1> undef, i32 undef) - call @llvm.vp.cttz.nvx1i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx2i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx4i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx8i32( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx16i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv1i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv2i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv4i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv8i32( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv16i32( undef, i1 false, undef, i32 undef) call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> undef, i1 false, <2 x i1> undef, i32 undef) call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> undef, i1 false, <4 x i1> undef, i32 undef) call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> undef, i1 false, <8 x i1> undef, i32 undef) call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> undef, i1 false, <16 x i1> undef, i32 undef) - call @llvm.vp.cttz.nvx1i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx2i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx4i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx8i64( undef, i1 false, undef, i32 undef) - call @llvm.vp.cttz.nvx16i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv1i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv2i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv4i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv8i64( undef, i1 false, undef, i32 undef) + call @llvm.vp.cttz.nxv16i64( undef, i1 false, undef, i32 undef) ret void } @@ -642,255 +642,255 @@ declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>) -declare @llvm.bswap.nvx1i16() -declare @llvm.bswap.nvx2i16() -declare 
@llvm.bswap.nvx4i16() -declare @llvm.bswap.nvx8i16() -declare @llvm.bswap.nvx16i16() +declare @llvm.bswap.nxv1i16() +declare @llvm.bswap.nxv2i16() +declare @llvm.bswap.nxv4i16() +declare @llvm.bswap.nxv8i16() +declare @llvm.bswap.nxv16i16() declare i32 @llvm.bswap.i32(i32) declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) declare <16 x i32> @llvm.bswap.v16i32(<16 x i32>) -declare @llvm.bswap.nvx1i32() -declare @llvm.bswap.nvx2i32() -declare @llvm.bswap.nvx4i32() -declare @llvm.bswap.nvx8i32() -declare @llvm.bswap.nvx16i32() +declare @llvm.bswap.nxv1i32() +declare @llvm.bswap.nxv2i32() +declare @llvm.bswap.nxv4i32() +declare @llvm.bswap.nxv8i32() +declare @llvm.bswap.nxv16i32() declare i64 @llvm.bswap.i64(i64) declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>) declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>) -declare @llvm.bswap.nvx1i64() -declare @llvm.bswap.nvx2i64() -declare @llvm.bswap.nvx4i64() -declare @llvm.bswap.nvx8i64() -declare @llvm.bswap.nvx16i64() +declare @llvm.bswap.nxv1i64() +declare @llvm.bswap.nxv2i64() +declare @llvm.bswap.nxv4i64() +declare @llvm.bswap.nxv8i64() +declare @llvm.bswap.nxv16i64() declare i8 @llvm.bitreverse.i8(i8) declare <2 x i8> @llvm.bitreverse.v2i8(<2 x i8>) declare <4 x i8> @llvm.bitreverse.v4i8(<4 x i8>) declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) -declare @llvm.bitreverse.nvx1i8() -declare @llvm.bitreverse.nvx2i8() -declare @llvm.bitreverse.nvx4i8() -declare @llvm.bitreverse.nvx8i8() -declare @llvm.bitreverse.nvx16i8() +declare @llvm.bitreverse.nxv1i8() +declare @llvm.bitreverse.nxv2i8() +declare @llvm.bitreverse.nxv4i8() +declare @llvm.bitreverse.nxv8i8() +declare @llvm.bitreverse.nxv16i8() declare i16 @llvm.bitreverse.i16(i16) declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) declare <4 
x i16> @llvm.bitreverse.v4i16(<4 x i16>) declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) -declare @llvm.bitreverse.nvx1i16() -declare @llvm.bitreverse.nvx2i16() -declare @llvm.bitreverse.nvx4i16() -declare @llvm.bitreverse.nvx8i16() -declare @llvm.bitreverse.nvx16i16() +declare @llvm.bitreverse.nxv1i16() +declare @llvm.bitreverse.nxv2i16() +declare @llvm.bitreverse.nxv4i16() +declare @llvm.bitreverse.nxv8i16() +declare @llvm.bitreverse.nxv16i16() declare i32 @llvm.bitreverse.i32(i32) declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) -declare @llvm.bitreverse.nvx1i32() -declare @llvm.bitreverse.nvx2i32() -declare @llvm.bitreverse.nvx4i32() -declare @llvm.bitreverse.nvx8i32() -declare @llvm.bitreverse.nvx16i32() +declare @llvm.bitreverse.nxv1i32() +declare @llvm.bitreverse.nxv2i32() +declare @llvm.bitreverse.nxv4i32() +declare @llvm.bitreverse.nxv8i32() +declare @llvm.bitreverse.nxv16i32() declare i64 @llvm.bitreverse.i64(i64) declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) declare <16 x i64> @llvm.bitreverse.v16i64(<16 x i64>) -declare @llvm.bitreverse.nvx1i64() -declare @llvm.bitreverse.nvx2i64() -declare @llvm.bitreverse.nvx4i64() -declare @llvm.bitreverse.nvx8i64() -declare @llvm.bitreverse.nvx16i64() +declare @llvm.bitreverse.nxv1i64() +declare @llvm.bitreverse.nxv2i64() +declare @llvm.bitreverse.nxv4i64() +declare @llvm.bitreverse.nxv8i64() +declare @llvm.bitreverse.nxv16i64() declare i8 @llvm.ctpop.i8(i8) declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) declare <4 x i8> @llvm.ctpop.v4i8(<4 x i8>) declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) -declare @llvm.ctpop.nvx1i8() -declare 
@llvm.ctpop.nvx2i8() -declare @llvm.ctpop.nvx4i8() -declare @llvm.ctpop.nvx8i8() -declare @llvm.ctpop.nvx16i8() +declare @llvm.ctpop.nxv1i8() +declare @llvm.ctpop.nxv2i8() +declare @llvm.ctpop.nxv4i8() +declare @llvm.ctpop.nxv8i8() +declare @llvm.ctpop.nxv16i8() declare i16 @llvm.ctpop.i16(i16) declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) -declare @llvm.ctpop.nvx1i16() -declare @llvm.ctpop.nvx2i16() -declare @llvm.ctpop.nvx4i16() -declare @llvm.ctpop.nvx8i16() -declare @llvm.ctpop.nvx16i16() +declare @llvm.ctpop.nxv1i16() +declare @llvm.ctpop.nxv2i16() +declare @llvm.ctpop.nxv4i16() +declare @llvm.ctpop.nxv8i16() +declare @llvm.ctpop.nxv16i16() declare i32 @llvm.ctpop.i32(i32) declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) -declare @llvm.ctpop.nvx1i32() -declare @llvm.ctpop.nvx2i32() -declare @llvm.ctpop.nvx4i32() -declare @llvm.ctpop.nvx8i32() -declare @llvm.ctpop.nvx16i32() +declare @llvm.ctpop.nxv1i32() +declare @llvm.ctpop.nxv2i32() +declare @llvm.ctpop.nxv4i32() +declare @llvm.ctpop.nxv8i32() +declare @llvm.ctpop.nxv16i32() declare i64 @llvm.ctpop.i64(i64) declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) -declare @llvm.ctpop.nvx1i64() -declare @llvm.ctpop.nvx2i64() -declare @llvm.ctpop.nvx4i64() -declare @llvm.ctpop.nvx8i64() -declare @llvm.ctpop.nvx16i64() +declare @llvm.ctpop.nxv1i64() +declare @llvm.ctpop.nxv2i64() +declare @llvm.ctpop.nxv4i64() +declare @llvm.ctpop.nxv8i64() +declare @llvm.ctpop.nxv16i64() declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32) declare <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16>, <4 x i1>, 
i32) declare <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16>, <8 x i1>, i32) declare <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16>, <16 x i1>, i32) -declare @llvm.vp.bswap.nvx1i16(, , i32) -declare @llvm.vp.bswap.nvx2i16(, , i32) -declare @llvm.vp.bswap.nvx4i16(, , i32) -declare @llvm.vp.bswap.nvx8i16(, , i32) -declare @llvm.vp.bswap.nvx16i16(, , i32) +declare @llvm.vp.bswap.nxv1i16(, , i32) +declare @llvm.vp.bswap.nxv2i16(, , i32) +declare @llvm.vp.bswap.nxv4i16(, , i32) +declare @llvm.vp.bswap.nxv8i16(, , i32) +declare @llvm.vp.bswap.nxv16i16(, , i32) declare <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32>, <2 x i1>, i32) declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32) declare <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32>, <8 x i1>, i32) declare <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32>, <16 x i1>, i32) -declare @llvm.vp.bswap.nvx1i32(, , i32) -declare @llvm.vp.bswap.nvx2i32(, , i32) -declare @llvm.vp.bswap.nvx4i32(, , i32) -declare @llvm.vp.bswap.nvx8i32(, , i32) -declare @llvm.vp.bswap.nvx16i32(, , i32) +declare @llvm.vp.bswap.nxv1i32(, , i32) +declare @llvm.vp.bswap.nxv2i32(, , i32) +declare @llvm.vp.bswap.nxv4i32(, , i32) +declare @llvm.vp.bswap.nxv8i32(, , i32) +declare @llvm.vp.bswap.nxv16i32(, , i32) declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32) declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32) declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32) declare <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64>, <16 x i1>, i32) -declare @llvm.vp.bswap.nvx1i64(, , i32) -declare @llvm.vp.bswap.nvx2i64(, , i32) -declare @llvm.vp.bswap.nvx4i64(, , i32) -declare @llvm.vp.bswap.nvx8i64(, , i32) -declare @llvm.vp.bswap.nvx16i64(, , i32) +declare @llvm.vp.bswap.nxv1i64(, , i32) +declare @llvm.vp.bswap.nxv2i64(, , i32) +declare @llvm.vp.bswap.nxv4i64(, , i32) +declare @llvm.vp.bswap.nxv8i64(, , i32) +declare @llvm.vp.bswap.nxv16i64(, , i32) declare <2 x i8> @llvm.vp.ctpop.v2i8(<2 x i8>, <2 x i1>, i32) declare <4 x i8> 
@llvm.vp.ctpop.v4i8(<4 x i8>, <4 x i1>, i32) declare <8 x i8> @llvm.vp.ctpop.v8i8(<8 x i8>, <8 x i1>, i32) declare <16 x i8> @llvm.vp.ctpop.v16i8(<16 x i8>, <16 x i1>, i32) -declare @llvm.vp.ctpop.nvx1i8(, , i32) -declare @llvm.vp.ctpop.nvx2i8(, , i32) -declare @llvm.vp.ctpop.nvx4i8(, , i32) -declare @llvm.vp.ctpop.nvx8i8(, , i32) -declare @llvm.vp.ctpop.nvx16i8(, , i32) +declare @llvm.vp.ctpop.nxv1i8(, , i32) +declare @llvm.vp.ctpop.nxv2i8(, , i32) +declare @llvm.vp.ctpop.nxv4i8(, , i32) +declare @llvm.vp.ctpop.nxv8i8(, , i32) +declare @llvm.vp.ctpop.nxv16i8(, , i32) declare <2 x i16> @llvm.vp.ctpop.v2i16(<2 x i16>, <2 x i1>, i32) declare <4 x i16> @llvm.vp.ctpop.v4i16(<4 x i16>, <4 x i1>, i32) declare <8 x i16> @llvm.vp.ctpop.v8i16(<8 x i16>, <8 x i1>, i32) declare <16 x i16> @llvm.vp.ctpop.v16i16(<16 x i16>, <16 x i1>, i32) -declare @llvm.vp.ctpop.nvx1i16(, , i32) -declare @llvm.vp.ctpop.nvx2i16(, , i32) -declare @llvm.vp.ctpop.nvx4i16(, , i32) -declare @llvm.vp.ctpop.nvx8i16(, , i32) -declare @llvm.vp.ctpop.nvx16i16(, , i32) +declare @llvm.vp.ctpop.nxv1i16(, , i32) +declare @llvm.vp.ctpop.nxv2i16(, , i32) +declare @llvm.vp.ctpop.nxv4i16(, , i32) +declare @llvm.vp.ctpop.nxv8i16(, , i32) +declare @llvm.vp.ctpop.nxv16i16(, , i32) declare <2 x i32> @llvm.vp.ctpop.v2i32(<2 x i32>, <2 x i1>, i32) declare <4 x i32> @llvm.vp.ctpop.v4i32(<4 x i32>, <4 x i1>, i32) declare <8 x i32> @llvm.vp.ctpop.v8i32(<8 x i32>, <8 x i1>, i32) declare <16 x i32> @llvm.vp.ctpop.v16i32(<16 x i32>, <16 x i1>, i32) -declare @llvm.vp.ctpop.nvx1i32(, , i32) -declare @llvm.vp.ctpop.nvx2i32(, , i32) -declare @llvm.vp.ctpop.nvx4i32(, , i32) -declare @llvm.vp.ctpop.nvx8i32(, , i32) -declare @llvm.vp.ctpop.nvx16i32(, , i32) +declare @llvm.vp.ctpop.nxv1i32(, , i32) +declare @llvm.vp.ctpop.nxv2i32(, , i32) +declare @llvm.vp.ctpop.nxv4i32(, , i32) +declare @llvm.vp.ctpop.nxv8i32(, , i32) +declare @llvm.vp.ctpop.nxv16i32(, , i32) declare <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64>, <2 x i1>, i32) declare 
<4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64>, <4 x i1>, i32) declare <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64>, <8 x i1>, i32) declare <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64>, <16 x i1>, i32) -declare @llvm.vp.ctpop.nvx1i64(, , i32) -declare @llvm.vp.ctpop.nvx2i64(, , i32) -declare @llvm.vp.ctpop.nvx4i64(, , i32) -declare @llvm.vp.ctpop.nvx8i64(, , i32) -declare @llvm.vp.ctpop.nvx16i64(, , i32) +declare @llvm.vp.ctpop.nxv1i64(, , i32) +declare @llvm.vp.ctpop.nxv2i64(, , i32) +declare @llvm.vp.ctpop.nxv4i64(, , i32) +declare @llvm.vp.ctpop.nxv8i64(, , i32) +declare @llvm.vp.ctpop.nxv16i64(, , i32) declare <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32) declare <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8>, i1 immarg, <4 x i1>, i32) declare <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8>, i1 immarg, <8 x i1>, i32) declare <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.ctlz.nvx1i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx2i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx4i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx8i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx16i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx32i8(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx64i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv1i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv2i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv4i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv8i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv16i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv32i8(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv64i8(, i1 immarg, , i32) declare <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16>, i1 immarg, <2 x i1>, i32) declare <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16>, i1 immarg, <4 x i1>, i32) declare <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16>, i1 immarg, <8 x i1>, i32) declare <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.ctlz.nvx1i16(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx2i16(, i1 immarg, 
, i32) -declare @llvm.vp.ctlz.nvx4i16(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx8i16(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx16i16(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx32i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv1i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv2i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv4i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv8i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv16i16(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv32i16(, i1 immarg, , i32) declare <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32>, i1 immarg, <2 x i1>, i32) declare <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32>, i1 immarg, <4 x i1>, i32) declare <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32>, i1 immarg, <8 x i1>, i32) declare <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.ctlz.nvx1i32(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx2i32(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx4i32(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx8i32(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx16i32(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv1i32(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv2i32(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv4i32(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv8i32(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv16i32(, i1 immarg, , i32) declare <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32) declare <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32) declare <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32) declare <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.ctlz.nvx1i64(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx2i64(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx4i64(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx8i64(, i1 immarg, , i32) -declare @llvm.vp.ctlz.nvx16i64(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv1i64(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv2i64(, i1 immarg, , i32) 
+declare @llvm.vp.ctlz.nxv4i64(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv8i64(, i1 immarg, , i32) +declare @llvm.vp.ctlz.nxv16i64(, i1 immarg, , i32) declare <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32) declare <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8>, i1 immarg, <4 x i1>, i32) declare <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8>, i1 immarg, <8 x i1>, i32) declare <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.cttz.nvx1i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx2i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx4i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx8i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx16i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx32i8(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx64i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv1i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv2i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv4i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv8i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv16i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv32i8(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv64i8(, i1 immarg, , i32) declare <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16>, i1 immarg, <2 x i1>, i32) declare <4 x i16> @llvm.vp.cttz.v4i16(<4 x i16>, i1 immarg, <4 x i1>, i32) declare <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16>, i1 immarg, <8 x i1>, i32) declare <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.cttz.nvx1i16(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx2i16(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx4i16(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx8i16(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx16i16(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx32i16(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv1i16(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv2i16(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv4i16(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv8i16(, i1 immarg, , i32) +declare 
@llvm.vp.cttz.nxv16i16(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv32i16(, i1 immarg, , i32) declare <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32>, i1 immarg, <2 x i1>, i32) declare <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32>, i1 immarg, <4 x i1>, i32) declare <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32>, i1 immarg, <8 x i1>, i32) declare <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.cttz.nvx1i32(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx2i32(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx4i32(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx8i32(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx16i32(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv1i32(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv2i32(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv4i32(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv8i32(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv16i32(, i1 immarg, , i32) declare <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64>, i1 immarg, <2 x i1>, i32) declare <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64>, i1 immarg, <4 x i1>, i32) declare <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64>, i1 immarg, <8 x i1>, i32) declare <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64>, i1 immarg, <16 x i1>, i32) -declare @llvm.vp.cttz.nvx1i64(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx2i64(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx4i64(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx8i64(, i1 immarg, , i32) -declare @llvm.vp.cttz.nvx16i64(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv1i64(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv2i64(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv4i64(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv8i64(, i1 immarg, , i32) +declare @llvm.vp.cttz.nxv16i64(, i1 immarg, , i32) diff --git a/llvm/test/Analysis/CostModel/RISCV/int-min-max.ll b/llvm/test/Analysis/CostModel/RISCV/int-min-max.ll index 730b7ffb53d60..10474d227851f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/int-min-max.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-min-max.ll @@ 
-49,40 +49,40 @@ define void @smax() { call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.smax.nvx1i8( undef, undef) - call @llvm.smax.nvx2i8( undef, undef) - call @llvm.smax.nvx4i8( undef, undef) - call @llvm.smax.nvx8i8( undef, undef) - call @llvm.smax.nvx16i8( undef, undef) + call @llvm.smax.nxv1i8( undef, undef) + call @llvm.smax.nxv2i8( undef, undef) + call @llvm.smax.nxv4i8( undef, undef) + call @llvm.smax.nxv8i8( undef, undef) + call @llvm.smax.nxv16i8( undef, undef) call i16 @llvm.smax.i16(i16 undef, i16 undef) call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.smax.nvx1i16( undef, undef) - call @llvm.smax.nvx2i16( undef, undef) - call @llvm.smax.nvx4i16( undef, undef) - call @llvm.smax.nvx8i16( undef, undef) - call @llvm.smax.nvx16i16( undef, undef) + call @llvm.smax.nxv1i16( undef, undef) + call @llvm.smax.nxv2i16( undef, undef) + call @llvm.smax.nxv4i16( undef, undef) + call @llvm.smax.nxv8i16( undef, undef) + call @llvm.smax.nxv16i16( undef, undef) call i32 @llvm.smax.i32(i32 undef, i32 undef) call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.smax.nvx1i32( undef, undef) - call @llvm.smax.nvx2i32( undef, undef) - call @llvm.smax.nvx4i32( undef, undef) - call @llvm.smax.nvx8i32( undef, undef) - call @llvm.smax.nvx16i32( undef, undef) + call @llvm.smax.nxv1i32( undef, undef) + call @llvm.smax.nxv2i32( undef, undef) + call @llvm.smax.nxv4i32( undef, undef) 
+ call @llvm.smax.nxv8i32( undef, undef) + call @llvm.smax.nxv16i32( undef, undef) call i64 @llvm.smax.i64(i64 undef, i64 undef) call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.smax.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.smax.nvx1i64( undef, undef) - call @llvm.smax.nvx2i64( undef, undef) - call @llvm.smax.nvx4i64( undef, undef) - call @llvm.smax.nvx8i64( undef, undef) + call @llvm.smax.nxv1i64( undef, undef) + call @llvm.smax.nxv2i64( undef, undef) + call @llvm.smax.nxv4i64( undef, undef) + call @llvm.smax.nxv8i64( undef, undef) ret void } @@ -134,40 +134,40 @@ define void @smin() { call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.smin.nvx1i8( undef, undef) - call @llvm.smin.nvx2i8( undef, undef) - call @llvm.smin.nvx4i8( undef, undef) - call @llvm.smin.nvx8i8( undef, undef) - call @llvm.smin.nvx16i8( undef, undef) + call @llvm.smin.nxv1i8( undef, undef) + call @llvm.smin.nxv2i8( undef, undef) + call @llvm.smin.nxv4i8( undef, undef) + call @llvm.smin.nxv8i8( undef, undef) + call @llvm.smin.nxv16i8( undef, undef) call i16 @llvm.smin.i16(i16 undef, i16 undef) call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.smin.nvx1i16( undef, undef) - call @llvm.smin.nvx2i16( undef, undef) - call @llvm.smin.nvx4i16( undef, undef) - call @llvm.smin.nvx8i16( undef, undef) - call @llvm.smin.nvx16i16( undef, undef) + call @llvm.smin.nxv1i16( undef, undef) + call @llvm.smin.nxv2i16( undef, undef) + call 
@llvm.smin.nxv4i16( undef, undef) + call @llvm.smin.nxv8i16( undef, undef) + call @llvm.smin.nxv16i16( undef, undef) call i32 @llvm.smin.i32(i32 undef, i32 undef) call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.smin.nvx1i32( undef, undef) - call @llvm.smin.nvx2i32( undef, undef) - call @llvm.smin.nvx4i32( undef, undef) - call @llvm.smin.nvx8i32( undef, undef) - call @llvm.smin.nvx16i32( undef, undef) + call @llvm.smin.nxv1i32( undef, undef) + call @llvm.smin.nxv2i32( undef, undef) + call @llvm.smin.nxv4i32( undef, undef) + call @llvm.smin.nxv8i32( undef, undef) + call @llvm.smin.nxv16i32( undef, undef) call i64 @llvm.smin.i64(i64 undef, i64 undef) call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.smin.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.smin.nvx1i64( undef, undef) - call @llvm.smin.nvx2i64( undef, undef) - call @llvm.smin.nvx4i64( undef, undef) - call @llvm.smin.nvx8i64( undef, undef) + call @llvm.smin.nxv1i64( undef, undef) + call @llvm.smin.nxv2i64( undef, undef) + call @llvm.smin.nxv4i64( undef, undef) + call @llvm.smin.nxv8i64( undef, undef) ret void } @@ -219,40 +219,40 @@ define void @umax() { call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.umax.nvx1i8( undef, undef) - call @llvm.umax.nvx2i8( undef, undef) - call @llvm.umax.nvx4i8( undef, undef) - call @llvm.umax.nvx8i8( undef, undef) - call @llvm.umax.nvx16i8( undef, undef) + call @llvm.umax.nxv1i8( undef, undef) + call @llvm.umax.nxv2i8( 
undef, undef) + call @llvm.umax.nxv4i8( undef, undef) + call @llvm.umax.nxv8i8( undef, undef) + call @llvm.umax.nxv16i8( undef, undef) call i16 @llvm.umax.i16(i16 undef, i16 undef) call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.umax.nvx1i16( undef, undef) - call @llvm.umax.nvx2i16( undef, undef) - call @llvm.umax.nvx4i16( undef, undef) - call @llvm.umax.nvx8i16( undef, undef) - call @llvm.umax.nvx16i16( undef, undef) + call @llvm.umax.nxv1i16( undef, undef) + call @llvm.umax.nxv2i16( undef, undef) + call @llvm.umax.nxv4i16( undef, undef) + call @llvm.umax.nxv8i16( undef, undef) + call @llvm.umax.nxv16i16( undef, undef) call i32 @llvm.umax.i32(i32 undef, i32 undef) call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.umax.nvx1i32( undef, undef) - call @llvm.umax.nvx2i32( undef, undef) - call @llvm.umax.nvx4i32( undef, undef) - call @llvm.umax.nvx8i32( undef, undef) - call @llvm.umax.nvx16i32( undef, undef) + call @llvm.umax.nxv1i32( undef, undef) + call @llvm.umax.nxv2i32( undef, undef) + call @llvm.umax.nxv4i32( undef, undef) + call @llvm.umax.nxv8i32( undef, undef) + call @llvm.umax.nxv16i32( undef, undef) call i64 @llvm.umax.i64(i64 undef, i64 undef) call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.umax.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.umax.nvx1i64( undef, undef) - call @llvm.umax.nvx2i64( undef, undef) - call 
@llvm.umax.nvx4i64( undef, undef) - call @llvm.umax.nvx8i64( undef, undef) + call @llvm.umax.nxv1i64( undef, undef) + call @llvm.umax.nxv2i64( undef, undef) + call @llvm.umax.nxv4i64( undef, undef) + call @llvm.umax.nxv8i64( undef, undef) ret void } @@ -304,40 +304,40 @@ define void @umin() { call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.umin.nvx1i8( undef, undef) - call @llvm.umin.nvx2i8( undef, undef) - call @llvm.umin.nvx4i8( undef, undef) - call @llvm.umin.nvx8i8( undef, undef) - call @llvm.umin.nvx16i8( undef, undef) + call @llvm.umin.nxv1i8( undef, undef) + call @llvm.umin.nxv2i8( undef, undef) + call @llvm.umin.nxv4i8( undef, undef) + call @llvm.umin.nxv8i8( undef, undef) + call @llvm.umin.nxv16i8( undef, undef) call i16 @llvm.umin.i16(i16 undef, i16 undef) call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.umin.nvx1i16( undef, undef) - call @llvm.umin.nvx2i16( undef, undef) - call @llvm.umin.nvx4i16( undef, undef) - call @llvm.umin.nvx8i16( undef, undef) - call @llvm.umin.nvx16i16( undef, undef) + call @llvm.umin.nxv1i16( undef, undef) + call @llvm.umin.nxv2i16( undef, undef) + call @llvm.umin.nxv4i16( undef, undef) + call @llvm.umin.nxv8i16( undef, undef) + call @llvm.umin.nxv16i16( undef, undef) call i32 @llvm.umin.i32(i32 undef, i32 undef) call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.umin.nvx1i32( undef, undef) - call @llvm.umin.nvx2i32( 
undef, undef) - call @llvm.umin.nvx4i32( undef, undef) - call @llvm.umin.nvx8i32( undef, undef) - call @llvm.umin.nvx16i32( undef, undef) + call @llvm.umin.nxv1i32( undef, undef) + call @llvm.umin.nxv2i32( undef, undef) + call @llvm.umin.nxv4i32( undef, undef) + call @llvm.umin.nxv8i32( undef, undef) + call @llvm.umin.nxv16i32( undef, undef) call i64 @llvm.umin.i64(i64 undef, i64 undef) call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.umin.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.umin.nvx1i64( undef, undef) - call @llvm.umin.nvx2i64( undef, undef) - call @llvm.umin.nvx4i64( undef, undef) - call @llvm.umin.nvx8i64( undef, undef) + call @llvm.umin.nxv1i64( undef, undef) + call @llvm.umin.nxv2i64( undef, undef) + call @llvm.umin.nxv4i64( undef, undef) + call @llvm.umin.nxv8i64( undef, undef) ret void } @@ -346,157 +346,157 @@ declare <2 x i8> @llvm.smax.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.smax.nvx1i8(, ) -declare @llvm.smax.nvx2i8(, ) -declare @llvm.smax.nvx4i8(, ) -declare @llvm.smax.nvx8i8(, ) -declare @llvm.smax.nvx16i8(, ) +declare @llvm.smax.nxv1i8(, ) +declare @llvm.smax.nxv2i8(, ) +declare @llvm.smax.nxv4i8(, ) +declare @llvm.smax.nxv8i8(, ) +declare @llvm.smax.nxv16i8(, ) declare i16 @llvm.smax.i16(i16, i16) declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.smax.nvx1i16(, ) -declare @llvm.smax.nvx2i16(, ) -declare @llvm.smax.nvx4i16(, ) -declare @llvm.smax.nvx8i16(, ) -declare @llvm.smax.nvx16i16(, ) +declare 
@llvm.smax.nxv1i16(, ) +declare @llvm.smax.nxv2i16(, ) +declare @llvm.smax.nxv4i16(, ) +declare @llvm.smax.nxv8i16(, ) +declare @llvm.smax.nxv16i16(, ) declare i32 @llvm.smax.i32(i32, i32) declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.smax.nvx1i32(, ) -declare @llvm.smax.nvx2i32(, ) -declare @llvm.smax.nvx4i32(, ) -declare @llvm.smax.nvx8i32(, ) -declare @llvm.smax.nvx16i32(, ) +declare @llvm.smax.nxv1i32(, ) +declare @llvm.smax.nxv2i32(, ) +declare @llvm.smax.nxv4i32(, ) +declare @llvm.smax.nxv8i32(, ) +declare @llvm.smax.nxv16i32(, ) declare i64 @llvm.smax.i64(i64, i64) declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.smax.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.smax.nvx1i64(, ) -declare @llvm.smax.nvx2i64(, ) -declare @llvm.smax.nvx4i64(, ) -declare @llvm.smax.nvx8i64(, ) +declare @llvm.smax.nxv1i64(, ) +declare @llvm.smax.nxv2i64(, ) +declare @llvm.smax.nxv4i64(, ) +declare @llvm.smax.nxv8i64(, ) declare i8 @llvm.smin.i8(i8, i8) declare <2 x i8> @llvm.smin.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.smin.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.smin.nvx1i8(, ) -declare @llvm.smin.nvx2i8(, ) -declare @llvm.smin.nvx4i8(, ) -declare @llvm.smin.nvx8i8(, ) -declare @llvm.smin.nvx16i8(, ) +declare @llvm.smin.nxv1i8(, ) +declare @llvm.smin.nxv2i8(, ) +declare @llvm.smin.nxv4i8(, ) +declare @llvm.smin.nxv8i8(, ) +declare @llvm.smin.nxv16i8(, ) declare i16 @llvm.smin.i16(i16, i16) declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> 
@llvm.smin.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.smin.nvx1i16(, ) -declare @llvm.smin.nvx2i16(, ) -declare @llvm.smin.nvx4i16(, ) -declare @llvm.smin.nvx8i16(, ) -declare @llvm.smin.nvx16i16(, ) +declare @llvm.smin.nxv1i16(, ) +declare @llvm.smin.nxv2i16(, ) +declare @llvm.smin.nxv4i16(, ) +declare @llvm.smin.nxv8i16(, ) +declare @llvm.smin.nxv16i16(, ) declare i32 @llvm.smin.i32(i32, i32) declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.smin.nvx1i32(, ) -declare @llvm.smin.nvx2i32(, ) -declare @llvm.smin.nvx4i32(, ) -declare @llvm.smin.nvx8i32(, ) -declare @llvm.smin.nvx16i32(, ) +declare @llvm.smin.nxv1i32(, ) +declare @llvm.smin.nxv2i32(, ) +declare @llvm.smin.nxv4i32(, ) +declare @llvm.smin.nxv8i32(, ) +declare @llvm.smin.nxv16i32(, ) declare i64 @llvm.smin.i64(i64, i64) declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.smin.nvx1i64(, ) -declare @llvm.smin.nvx2i64(, ) -declare @llvm.smin.nvx4i64(, ) -declare @llvm.smin.nvx8i64(, ) +declare @llvm.smin.nxv1i64(, ) +declare @llvm.smin.nxv2i64(, ) +declare @llvm.smin.nxv4i64(, ) +declare @llvm.smin.nxv8i64(, ) declare i8 @llvm.umax.i8(i8, i8) declare <2 x i8> @llvm.umax.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.umax.nvx1i8(, ) -declare @llvm.umax.nvx2i8(, ) -declare @llvm.umax.nvx4i8(, ) -declare @llvm.umax.nvx8i8(, ) -declare @llvm.umax.nvx16i8(, ) +declare @llvm.umax.nxv1i8(, ) +declare 
@llvm.umax.nxv2i8(, ) +declare @llvm.umax.nxv4i8(, ) +declare @llvm.umax.nxv8i8(, ) +declare @llvm.umax.nxv16i8(, ) declare i16 @llvm.umax.i16(i16, i16) declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.umax.nvx1i16(, ) -declare @llvm.umax.nvx2i16(, ) -declare @llvm.umax.nvx4i16(, ) -declare @llvm.umax.nvx8i16(, ) -declare @llvm.umax.nvx16i16(, ) +declare @llvm.umax.nxv1i16(, ) +declare @llvm.umax.nxv2i16(, ) +declare @llvm.umax.nxv4i16(, ) +declare @llvm.umax.nxv8i16(, ) +declare @llvm.umax.nxv16i16(, ) declare i32 @llvm.umax.i32(i32, i32) declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.umax.nvx1i32(, ) -declare @llvm.umax.nvx2i32(, ) -declare @llvm.umax.nvx4i32(, ) -declare @llvm.umax.nvx8i32(, ) -declare @llvm.umax.nvx16i32(, ) +declare @llvm.umax.nxv1i32(, ) +declare @llvm.umax.nxv2i32(, ) +declare @llvm.umax.nxv4i32(, ) +declare @llvm.umax.nxv8i32(, ) +declare @llvm.umax.nxv16i32(, ) declare i64 @llvm.umax.i64(i64, i64) declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.umax.nvx1i64(, ) -declare @llvm.umax.nvx2i64(, ) -declare @llvm.umax.nvx4i64(, ) -declare @llvm.umax.nvx8i64(, ) +declare @llvm.umax.nxv1i64(, ) +declare @llvm.umax.nxv2i64(, ) +declare @llvm.umax.nxv4i64(, ) +declare @llvm.umax.nxv8i64(, ) declare i8 @llvm.umin.i8(i8, i8) declare <2 x i8> @llvm.umin.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.umin.v8i8(<8 x 
i8>, <8 x i8>) declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.umin.nvx1i8(, ) -declare @llvm.umin.nvx2i8(, ) -declare @llvm.umin.nvx4i8(, ) -declare @llvm.umin.nvx8i8(, ) -declare @llvm.umin.nvx16i8(, ) +declare @llvm.umin.nxv1i8(, ) +declare @llvm.umin.nxv2i8(, ) +declare @llvm.umin.nxv4i8(, ) +declare @llvm.umin.nxv8i8(, ) +declare @llvm.umin.nxv16i8(, ) declare i16 @llvm.umin.i16(i16, i16) declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.umin.nvx1i16(, ) -declare @llvm.umin.nvx2i16(, ) -declare @llvm.umin.nvx4i16(, ) -declare @llvm.umin.nvx8i16(, ) -declare @llvm.umin.nvx16i16(, ) +declare @llvm.umin.nxv1i16(, ) +declare @llvm.umin.nxv2i16(, ) +declare @llvm.umin.nxv4i16(, ) +declare @llvm.umin.nxv8i16(, ) +declare @llvm.umin.nxv16i16(, ) declare i32 @llvm.umin.i32(i32, i32) declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.umin.nvx1i32(, ) -declare @llvm.umin.nvx2i32(, ) -declare @llvm.umin.nvx4i32(, ) -declare @llvm.umin.nvx8i32(, ) -declare @llvm.umin.nvx16i32(, ) +declare @llvm.umin.nxv1i32(, ) +declare @llvm.umin.nxv2i32(, ) +declare @llvm.umin.nxv4i32(, ) +declare @llvm.umin.nxv8i32(, ) +declare @llvm.umin.nxv16i32(, ) declare i64 @llvm.umin.i64(i64, i64) declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.umin.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.umin.nvx1i64(, ) -declare @llvm.umin.nvx2i64(, ) -declare @llvm.umin.nvx4i64(, ) -declare @llvm.umin.nvx8i64(, ) +declare @llvm.umin.nxv1i64(, ) 
+declare @llvm.umin.nxv2i64(, ) +declare @llvm.umin.nxv4i64(, ) +declare @llvm.umin.nxv8i64(, ) diff --git a/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll b/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll index 185fcc9ce8b33..be6b7c57d2252 100644 --- a/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-sat-math.ll @@ -45,36 +45,36 @@ define void @sadd.sat() { call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.sadd.sat.nvx2i8( undef, undef) - call @llvm.sadd.sat.nvx4i8( undef, undef) - call @llvm.sadd.sat.nvx8i8( undef, undef) - call @llvm.sadd.sat.nvx16i8( undef, undef) + call @llvm.sadd.sat.nxv2i8( undef, undef) + call @llvm.sadd.sat.nxv4i8( undef, undef) + call @llvm.sadd.sat.nxv8i8( undef, undef) + call @llvm.sadd.sat.nxv16i8( undef, undef) call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.sadd.sat.nvx2i16( undef, undef) - call @llvm.sadd.sat.nvx4i16( undef, undef) - call @llvm.sadd.sat.nvx8i16( undef, undef) - call @llvm.sadd.sat.nvx16i16( undef, undef) + call @llvm.sadd.sat.nxv2i16( undef, undef) + call @llvm.sadd.sat.nxv4i16( undef, undef) + call @llvm.sadd.sat.nxv8i16( undef, undef) + call @llvm.sadd.sat.nxv16i16( undef, undef) call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> 
undef) - call @llvm.sadd.sat.nvx2i32( undef, undef) - call @llvm.sadd.sat.nvx4i32( undef, undef) - call @llvm.sadd.sat.nvx8i32( undef, undef) - call @llvm.sadd.sat.nvx16i32( undef, undef) + call @llvm.sadd.sat.nxv2i32( undef, undef) + call @llvm.sadd.sat.nxv4i32( undef, undef) + call @llvm.sadd.sat.nxv8i32( undef, undef) + call @llvm.sadd.sat.nxv16i32( undef, undef) call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.sadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.sadd.sat.nvx2i64( undef, undef) - call @llvm.sadd.sat.nvx4i64( undef, undef) - call @llvm.sadd.sat.nvx8i64( undef, undef) + call @llvm.sadd.sat.nxv2i64( undef, undef) + call @llvm.sadd.sat.nxv4i64( undef, undef) + call @llvm.sadd.sat.nxv8i64( undef, undef) ret void } @@ -122,36 +122,36 @@ define void @uadd.sat() { call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.uadd.sat.nvx2i8( undef, undef) - call @llvm.uadd.sat.nvx4i8( undef, undef) - call @llvm.uadd.sat.nvx8i8( undef, undef) - call @llvm.uadd.sat.nvx16i8( undef, undef) + call @llvm.uadd.sat.nxv2i8( undef, undef) + call @llvm.uadd.sat.nxv4i8( undef, undef) + call @llvm.uadd.sat.nxv8i8( undef, undef) + call @llvm.uadd.sat.nxv16i8( undef, undef) call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.uadd.sat.nvx2i16( undef, undef) - call @llvm.uadd.sat.nvx4i16( 
undef, undef) - call @llvm.uadd.sat.nvx8i16( undef, undef) - call @llvm.uadd.sat.nvx16i16( undef, undef) + call @llvm.uadd.sat.nxv2i16( undef, undef) + call @llvm.uadd.sat.nxv4i16( undef, undef) + call @llvm.uadd.sat.nxv8i16( undef, undef) + call @llvm.uadd.sat.nxv16i16( undef, undef) call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.uadd.sat.nvx2i32( undef, undef) - call @llvm.uadd.sat.nvx4i32( undef, undef) - call @llvm.uadd.sat.nvx8i32( undef, undef) - call @llvm.uadd.sat.nvx16i32( undef, undef) + call @llvm.uadd.sat.nxv2i32( undef, undef) + call @llvm.uadd.sat.nxv4i32( undef, undef) + call @llvm.uadd.sat.nxv8i32( undef, undef) + call @llvm.uadd.sat.nxv16i32( undef, undef) call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.uadd.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.uadd.sat.nvx2i64( undef, undef) - call @llvm.uadd.sat.nvx4i64( undef, undef) - call @llvm.uadd.sat.nvx8i64( undef, undef) + call @llvm.uadd.sat.nxv2i64( undef, undef) + call @llvm.uadd.sat.nxv4i64( undef, undef) + call @llvm.uadd.sat.nxv8i64( undef, undef) ret void } @@ -199,36 +199,36 @@ define void @usub.sat() { call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.usub.sat.nvx2i8( undef, undef) - call @llvm.usub.sat.nvx4i8( undef, undef) - call @llvm.usub.sat.nvx8i8( undef, undef) - call 
@llvm.usub.sat.nvx16i8( undef, undef) + call @llvm.usub.sat.nxv2i8( undef, undef) + call @llvm.usub.sat.nxv4i8( undef, undef) + call @llvm.usub.sat.nxv8i8( undef, undef) + call @llvm.usub.sat.nxv16i8( undef, undef) call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.usub.sat.nvx2i16( undef, undef) - call @llvm.usub.sat.nvx4i16( undef, undef) - call @llvm.usub.sat.nvx8i16( undef, undef) - call @llvm.usub.sat.nvx16i16( undef, undef) + call @llvm.usub.sat.nxv2i16( undef, undef) + call @llvm.usub.sat.nxv4i16( undef, undef) + call @llvm.usub.sat.nxv8i16( undef, undef) + call @llvm.usub.sat.nxv16i16( undef, undef) call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.usub.sat.nvx2i32( undef, undef) - call @llvm.usub.sat.nvx4i32( undef, undef) - call @llvm.usub.sat.nvx8i32( undef, undef) - call @llvm.usub.sat.nvx16i32( undef, undef) + call @llvm.usub.sat.nxv2i32( undef, undef) + call @llvm.usub.sat.nxv4i32( undef, undef) + call @llvm.usub.sat.nxv8i32( undef, undef) + call @llvm.usub.sat.nxv16i32( undef, undef) call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.usub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.usub.sat.nvx2i64( undef, undef) - call 
@llvm.usub.sat.nvx4i64( undef, undef) - call @llvm.usub.sat.nvx8i64( undef, undef) + call @llvm.usub.sat.nxv2i64( undef, undef) + call @llvm.usub.sat.nxv4i64( undef, undef) + call @llvm.usub.sat.nxv8i64( undef, undef) ret void } @@ -276,36 +276,36 @@ define void @ssub.sat() { call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.ssub.sat.nvx2i8( undef, undef) - call @llvm.ssub.sat.nvx4i8( undef, undef) - call @llvm.ssub.sat.nvx8i8( undef, undef) - call @llvm.ssub.sat.nvx16i8( undef, undef) + call @llvm.ssub.sat.nxv2i8( undef, undef) + call @llvm.ssub.sat.nxv4i8( undef, undef) + call @llvm.ssub.sat.nxv8i8( undef, undef) + call @llvm.ssub.sat.nxv16i8( undef, undef) call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.ssub.sat.nvx2i16( undef, undef) - call @llvm.ssub.sat.nvx4i16( undef, undef) - call @llvm.ssub.sat.nvx8i16( undef, undef) - call @llvm.ssub.sat.nvx16i16( undef, undef) + call @llvm.ssub.sat.nxv2i16( undef, undef) + call @llvm.ssub.sat.nxv4i16( undef, undef) + call @llvm.ssub.sat.nxv8i16( undef, undef) + call @llvm.ssub.sat.nxv16i16( undef, undef) call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.ssub.sat.nvx2i32( undef, undef) - call @llvm.ssub.sat.nvx4i32( undef, undef) - call @llvm.ssub.sat.nvx8i32( undef, undef) - 
call @llvm.ssub.sat.nvx16i32( undef, undef) + call @llvm.ssub.sat.nxv2i32( undef, undef) + call @llvm.ssub.sat.nxv4i32( undef, undef) + call @llvm.ssub.sat.nxv8i32( undef, undef) + call @llvm.ssub.sat.nxv16i32( undef, undef) call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.ssub.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.ssub.sat.nvx2i64( undef, undef) - call @llvm.ssub.sat.nvx4i64( undef, undef) - call @llvm.ssub.sat.nvx8i64( undef, undef) + call @llvm.ssub.sat.nxv2i64( undef, undef) + call @llvm.ssub.sat.nxv4i64( undef, undef) + call @llvm.ssub.sat.nxv8i64( undef, undef) ret void } @@ -353,36 +353,36 @@ define void @ushl.sat() { call <4 x i8> @llvm.ushl.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.ushl.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.ushl.sat.nvx2i8( undef, undef) - call @llvm.ushl.sat.nvx4i8( undef, undef) - call @llvm.ushl.sat.nvx8i8( undef, undef) - call @llvm.ushl.sat.nvx16i8( undef, undef) + call @llvm.ushl.sat.nxv2i8( undef, undef) + call @llvm.ushl.sat.nxv4i8( undef, undef) + call @llvm.ushl.sat.nxv8i8( undef, undef) + call @llvm.ushl.sat.nxv16i8( undef, undef) call i16 @llvm.ushl.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.ushl.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.ushl.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.ushl.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.ushl.sat.nvx2i16( undef, undef) - call @llvm.ushl.sat.nvx4i16( undef, undef) - call @llvm.ushl.sat.nvx8i16( undef, undef) - call @llvm.ushl.sat.nvx16i16( undef, undef) + call @llvm.ushl.sat.nxv2i16( undef, 
undef) + call @llvm.ushl.sat.nxv4i16( undef, undef) + call @llvm.ushl.sat.nxv8i16( undef, undef) + call @llvm.ushl.sat.nxv16i16( undef, undef) call i32 @llvm.ushl.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.ushl.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.ushl.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.ushl.sat.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.ushl.sat.nvx2i32( undef, undef) - call @llvm.ushl.sat.nvx4i32( undef, undef) - call @llvm.ushl.sat.nvx8i32( undef, undef) - call @llvm.ushl.sat.nvx16i32( undef, undef) + call @llvm.ushl.sat.nxv2i32( undef, undef) + call @llvm.ushl.sat.nxv4i32( undef, undef) + call @llvm.ushl.sat.nxv8i32( undef, undef) + call @llvm.ushl.sat.nxv16i32( undef, undef) call i64 @llvm.ushl.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.ushl.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.ushl.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.ushl.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.ushl.sat.nvx2i64( undef, undef) - call @llvm.ushl.sat.nvx4i64( undef, undef) - call @llvm.ushl.sat.nvx8i64( undef, undef) + call @llvm.ushl.sat.nxv2i64( undef, undef) + call @llvm.ushl.sat.nxv4i64( undef, undef) + call @llvm.ushl.sat.nxv8i64( undef, undef) ret void } @@ -430,36 +430,36 @@ define void @sshl.sat() { call <4 x i8> @llvm.sshl.sat.v4i8(<4 x i8> undef, <4 x i8> undef) call <8 x i8> @llvm.sshl.sat.v8i8(<8 x i8> undef, <8 x i8> undef) call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> undef, <16 x i8> undef) - call @llvm.sshl.sat.nvx2i8( undef, undef) - call @llvm.sshl.sat.nvx4i8( undef, undef) - call @llvm.sshl.sat.nvx8i8( undef, undef) - call @llvm.sshl.sat.nvx16i8( undef, undef) + call @llvm.sshl.sat.nxv2i8( undef, undef) + call @llvm.sshl.sat.nxv4i8( undef, undef) + call @llvm.sshl.sat.nxv8i8( 
undef, undef) + call @llvm.sshl.sat.nxv16i8( undef, undef) call i16 @llvm.sshl.sat.i16(i16 undef, i16 undef) call <2 x i16> @llvm.sshl.sat.v2i16(<2 x i16> undef, <2 x i16> undef) call <4 x i16> @llvm.sshl.sat.v4i16(<4 x i16> undef, <4 x i16> undef) call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> undef, <8 x i16> undef) call <16 x i16> @llvm.sshl.sat.v16i16(<16 x i16> undef, <16 x i16> undef) - call @llvm.sshl.sat.nvx2i16( undef, undef) - call @llvm.sshl.sat.nvx4i16( undef, undef) - call @llvm.sshl.sat.nvx8i16( undef, undef) - call @llvm.sshl.sat.nvx16i16( undef, undef) + call @llvm.sshl.sat.nxv2i16( undef, undef) + call @llvm.sshl.sat.nxv4i16( undef, undef) + call @llvm.sshl.sat.nxv8i16( undef, undef) + call @llvm.sshl.sat.nxv16i16( undef, undef) call i32 @llvm.sshl.sat.i32(i32 undef, i32 undef) call <2 x i32> @llvm.sshl.sat.v2i32(<2 x i32> undef, <2 x i32> undef) call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> undef, <4 x i32> undef) call <8 x i32> @llvm.sshl.sat.v8i32(<8 x i32> undef, <8 x i32> undef) call <16 x i32> @llvm.sshl.sat.v16i32(<16 x i32> undef, <16 x i32> undef) - call @llvm.sshl.sat.nvx2i32( undef, undef) - call @llvm.sshl.sat.nvx4i32( undef, undef) - call @llvm.sshl.sat.nvx8i32( undef, undef) - call @llvm.sshl.sat.nvx16i32( undef, undef) + call @llvm.sshl.sat.nxv2i32( undef, undef) + call @llvm.sshl.sat.nxv4i32( undef, undef) + call @llvm.sshl.sat.nxv8i32( undef, undef) + call @llvm.sshl.sat.nxv16i32( undef, undef) call i64 @llvm.sshl.sat.i64(i64 undef, i64 undef) call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> undef, <2 x i64> undef) call <4 x i64> @llvm.sshl.sat.v4i64(<4 x i64> undef, <4 x i64> undef) call <8 x i64> @llvm.sshl.sat.v8i64(<8 x i64> undef, <8 x i64> undef) call <16 x i64> @llvm.sshl.sat.v16i64(<16 x i64> undef, <16 x i64> undef) - call @llvm.sshl.sat.nvx2i64( undef, undef) - call @llvm.sshl.sat.nvx4i64( undef, undef) - call @llvm.sshl.sat.nvx8i64( undef, undef) + call @llvm.sshl.sat.nxv2i64( undef, undef) + call @llvm.sshl.sat.nxv4i64( 
undef, undef) + call @llvm.sshl.sat.nxv8i64( undef, undef) ret void } @@ -468,213 +468,213 @@ declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.sadd.sat.nvx2i8(, ) -declare @llvm.sadd.sat.nvx4i8(, ) -declare @llvm.sadd.sat.nvx8i8(, ) -declare @llvm.sadd.sat.nvx16i8(, ) +declare @llvm.sadd.sat.nxv2i8(, ) +declare @llvm.sadd.sat.nxv4i8(, ) +declare @llvm.sadd.sat.nxv8i8(, ) +declare @llvm.sadd.sat.nxv16i8(, ) declare i16 @llvm.sadd.sat.i16(i16, i16) declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.sadd.sat.nvx2i16(, ) -declare @llvm.sadd.sat.nvx4i16(, ) -declare @llvm.sadd.sat.nvx8i16(, ) -declare @llvm.sadd.sat.nvx16i16(, ) +declare @llvm.sadd.sat.nxv2i16(, ) +declare @llvm.sadd.sat.nxv4i16(, ) +declare @llvm.sadd.sat.nxv8i16(, ) +declare @llvm.sadd.sat.nxv16i16(, ) declare i32 @llvm.sadd.sat.i32(i32, i32) declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.sadd.sat.nvx2i32(, ) -declare @llvm.sadd.sat.nvx4i32(, ) -declare @llvm.sadd.sat.nvx8i32(, ) -declare @llvm.sadd.sat.nvx16i32(, ) +declare @llvm.sadd.sat.nxv2i32(, ) +declare @llvm.sadd.sat.nxv4i32(, ) +declare @llvm.sadd.sat.nxv8i32(, ) +declare @llvm.sadd.sat.nxv16i32(, ) declare i64 @llvm.sadd.sat.i64(i64, i64) declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64>, <8 x i64>) declare 
<16 x i64> @llvm.sadd.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.sadd.sat.nvx2i64(, ) -declare @llvm.sadd.sat.nvx4i64(, ) -declare @llvm.sadd.sat.nvx8i64(, ) +declare @llvm.sadd.sat.nxv2i64(, ) +declare @llvm.sadd.sat.nxv4i64(, ) +declare @llvm.sadd.sat.nxv8i64(, ) declare i8 @llvm.uadd.sat.i8(i8, i8) declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.uadd.sat.nvx2i8(, ) -declare @llvm.uadd.sat.nvx4i8(, ) -declare @llvm.uadd.sat.nvx8i8(, ) -declare @llvm.uadd.sat.nvx16i8(, ) +declare @llvm.uadd.sat.nxv2i8(, ) +declare @llvm.uadd.sat.nxv4i8(, ) +declare @llvm.uadd.sat.nxv8i8(, ) +declare @llvm.uadd.sat.nxv16i8(, ) declare i16 @llvm.uadd.sat.i16(i16, i16) declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.uadd.sat.nvx2i16(, ) -declare @llvm.uadd.sat.nvx4i16(, ) -declare @llvm.uadd.sat.nvx8i16(, ) -declare @llvm.uadd.sat.nvx16i16(, ) +declare @llvm.uadd.sat.nxv2i16(, ) +declare @llvm.uadd.sat.nxv4i16(, ) +declare @llvm.uadd.sat.nxv8i16(, ) +declare @llvm.uadd.sat.nxv16i16(, ) declare i32 @llvm.uadd.sat.i32(i32, i32) declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.uadd.sat.nvx2i32(, ) -declare @llvm.uadd.sat.nvx4i32(, ) -declare @llvm.uadd.sat.nvx8i32(, ) -declare @llvm.uadd.sat.nvx16i32(, ) +declare @llvm.uadd.sat.nxv2i32(, ) +declare @llvm.uadd.sat.nxv4i32(, ) +declare @llvm.uadd.sat.nxv8i32(, ) +declare @llvm.uadd.sat.nxv16i32(, ) declare i64 
@llvm.uadd.sat.i64(i64, i64) declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.uadd.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.uadd.sat.nvx2i64(, ) -declare @llvm.uadd.sat.nvx4i64(, ) -declare @llvm.uadd.sat.nvx8i64(, ) +declare @llvm.uadd.sat.nxv2i64(, ) +declare @llvm.uadd.sat.nxv4i64(, ) +declare @llvm.uadd.sat.nxv8i64(, ) declare i8 @llvm.usub.sat.i8(i8, i8) declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.usub.sat.nvx2i8(, ) -declare @llvm.usub.sat.nvx4i8(, ) -declare @llvm.usub.sat.nvx8i8(, ) -declare @llvm.usub.sat.nvx16i8(, ) +declare @llvm.usub.sat.nxv2i8(, ) +declare @llvm.usub.sat.nxv4i8(, ) +declare @llvm.usub.sat.nxv8i8(, ) +declare @llvm.usub.sat.nxv16i8(, ) declare i16 @llvm.usub.sat.i16(i16, i16) declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.usub.sat.nvx2i16(, ) -declare @llvm.usub.sat.nvx4i16(, ) -declare @llvm.usub.sat.nvx8i16(, ) -declare @llvm.usub.sat.nvx16i16(, ) +declare @llvm.usub.sat.nxv2i16(, ) +declare @llvm.usub.sat.nxv4i16(, ) +declare @llvm.usub.sat.nxv8i16(, ) +declare @llvm.usub.sat.nxv16i16(, ) declare i32 @llvm.usub.sat.i32(i32, i32) declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.usub.sat.nvx2i32(, ) -declare @llvm.usub.sat.nvx4i32(, ) -declare 
@llvm.usub.sat.nvx8i32(, ) -declare @llvm.usub.sat.nvx16i32(, ) +declare @llvm.usub.sat.nxv2i32(, ) +declare @llvm.usub.sat.nxv4i32(, ) +declare @llvm.usub.sat.nxv8i32(, ) +declare @llvm.usub.sat.nxv16i32(, ) declare i64 @llvm.usub.sat.i64(i64, i64) declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.usub.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.usub.sat.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.usub.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.usub.sat.nvx2i64(, ) -declare @llvm.usub.sat.nvx4i64(, ) -declare @llvm.usub.sat.nvx8i64(, ) +declare @llvm.usub.sat.nxv2i64(, ) +declare @llvm.usub.sat.nxv4i64(, ) +declare @llvm.usub.sat.nxv8i64(, ) declare i8 @llvm.ssub.sat.i8(i8, i8) declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.ssub.sat.nvx2i8(, ) -declare @llvm.ssub.sat.nvx4i8(, ) -declare @llvm.ssub.sat.nvx8i8(, ) -declare @llvm.ssub.sat.nvx16i8(, ) +declare @llvm.ssub.sat.nxv2i8(, ) +declare @llvm.ssub.sat.nxv4i8(, ) +declare @llvm.ssub.sat.nxv8i8(, ) +declare @llvm.ssub.sat.nxv16i8(, ) declare i16 @llvm.ssub.sat.i16(i16, i16) declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.ssub.sat.nvx2i16(, ) -declare @llvm.ssub.sat.nvx4i16(, ) -declare @llvm.ssub.sat.nvx8i16(, ) -declare @llvm.ssub.sat.nvx16i16(, ) +declare @llvm.ssub.sat.nxv2i16(, ) +declare @llvm.ssub.sat.nxv4i16(, ) +declare @llvm.ssub.sat.nxv8i16(, ) +declare @llvm.ssub.sat.nxv16i16(, ) declare i32 @llvm.ssub.sat.i32(i32, i32) declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 
x i32>) declare <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.ssub.sat.nvx2i32(, ) -declare @llvm.ssub.sat.nvx4i32(, ) -declare @llvm.ssub.sat.nvx8i32(, ) -declare @llvm.ssub.sat.nvx16i32(, ) +declare @llvm.ssub.sat.nxv2i32(, ) +declare @llvm.ssub.sat.nxv4i32(, ) +declare @llvm.ssub.sat.nxv8i32(, ) +declare @llvm.ssub.sat.nxv16i32(, ) declare i64 @llvm.ssub.sat.i64(i64, i64) declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.ssub.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.ssub.sat.nvx2i64(, ) -declare @llvm.ssub.sat.nvx4i64(, ) -declare @llvm.ssub.sat.nvx8i64(, ) +declare @llvm.ssub.sat.nxv2i64(, ) +declare @llvm.ssub.sat.nxv4i64(, ) +declare @llvm.ssub.sat.nxv8i64(, ) declare i8 @llvm.ushl.sat.i8(i8, i8) declare <2 x i8> @llvm.ushl.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.ushl.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.ushl.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.ushl.sat.nvx2i8(, ) -declare @llvm.ushl.sat.nvx4i8(, ) -declare @llvm.ushl.sat.nvx8i8(, ) -declare @llvm.ushl.sat.nvx16i8(, ) +declare @llvm.ushl.sat.nxv2i8(, ) +declare @llvm.ushl.sat.nxv4i8(, ) +declare @llvm.ushl.sat.nxv8i8(, ) +declare @llvm.ushl.sat.nxv16i8(, ) declare i16 @llvm.ushl.sat.i16(i16, i16) declare <2 x i16> @llvm.ushl.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.ushl.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.ushl.sat.v16i16(<16 x i16>, <16 x i16>) -declare @llvm.ushl.sat.nvx2i16(, ) -declare @llvm.ushl.sat.nvx4i16(, ) -declare @llvm.ushl.sat.nvx8i16(, ) -declare @llvm.ushl.sat.nvx16i16(, ) +declare @llvm.ushl.sat.nxv2i16(, ) +declare @llvm.ushl.sat.nxv4i16(, ) +declare 
@llvm.ushl.sat.nxv8i16(, ) +declare @llvm.ushl.sat.nxv16i16(, ) declare i32 @llvm.ushl.sat.i32(i32, i32) declare <2 x i32> @llvm.ushl.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.ushl.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.ushl.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.ushl.sat.nvx2i32(, ) -declare @llvm.ushl.sat.nvx4i32(, ) -declare @llvm.ushl.sat.nvx8i32(, ) -declare @llvm.ushl.sat.nvx16i32(, ) +declare @llvm.ushl.sat.nxv2i32(, ) +declare @llvm.ushl.sat.nxv4i32(, ) +declare @llvm.ushl.sat.nxv8i32(, ) +declare @llvm.ushl.sat.nxv16i32(, ) declare i64 @llvm.ushl.sat.i64(i64, i64) declare <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.ushl.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.ushl.sat.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.ushl.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.ushl.sat.nvx2i64(, ) -declare @llvm.ushl.sat.nvx4i64(, ) -declare @llvm.ushl.sat.nvx8i64(, ) +declare @llvm.ushl.sat.nxv2i64(, ) +declare @llvm.ushl.sat.nxv4i64(, ) +declare @llvm.ushl.sat.nxv8i64(, ) declare i8 @llvm.sshl.sat.i8(i8, i8) declare <2 x i8> @llvm.sshl.sat.v2i8(<2 x i8>, <2 x i8>) declare <4 x i8> @llvm.sshl.sat.v4i8(<4 x i8>, <4 x i8>) declare <8 x i8> @llvm.sshl.sat.v8i8(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8>, <16 x i8>) -declare @llvm.sshl.sat.nvx2i8(, ) -declare @llvm.sshl.sat.nvx4i8(, ) -declare @llvm.sshl.sat.nvx8i8(, ) -declare @llvm.sshl.sat.nvx16i8(, ) +declare @llvm.sshl.sat.nxv2i8(, ) +declare @llvm.sshl.sat.nxv4i8(, ) +declare @llvm.sshl.sat.nxv8i8(, ) +declare @llvm.sshl.sat.nxv16i8(, ) declare i16 @llvm.sshl.sat.i16(i16, i16) declare <2 x i16> @llvm.sshl.sat.v2i16(<2 x i16>, <2 x i16>) declare <4 x i16> @llvm.sshl.sat.v4i16(<4 x i16>, <4 x i16>) declare <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16>, <8 x i16>) declare <16 x i16> @llvm.sshl.sat.v16i16(<16 x i16>, <16 x i16>) -declare 
@llvm.sshl.sat.nvx2i16(, ) -declare @llvm.sshl.sat.nvx4i16(, ) -declare @llvm.sshl.sat.nvx8i16(, ) -declare @llvm.sshl.sat.nvx16i16(, ) +declare @llvm.sshl.sat.nxv2i16(, ) +declare @llvm.sshl.sat.nxv4i16(, ) +declare @llvm.sshl.sat.nxv8i16(, ) +declare @llvm.sshl.sat.nxv16i16(, ) declare i32 @llvm.sshl.sat.i32(i32, i32) declare <2 x i32> @llvm.sshl.sat.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32>, <4 x i32>) declare <8 x i32> @llvm.sshl.sat.v8i32(<8 x i32>, <8 x i32>) declare <16 x i32> @llvm.sshl.sat.v16i32(<16 x i32>, <16 x i32>) -declare @llvm.sshl.sat.nvx2i32(, ) -declare @llvm.sshl.sat.nvx4i32(, ) -declare @llvm.sshl.sat.nvx8i32(, ) -declare @llvm.sshl.sat.nvx16i32(, ) +declare @llvm.sshl.sat.nxv2i32(, ) +declare @llvm.sshl.sat.nxv4i32(, ) +declare @llvm.sshl.sat.nxv8i32(, ) +declare @llvm.sshl.sat.nxv16i32(, ) declare i64 @llvm.sshl.sat.i64(i64, i64) declare <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64>, <2 x i64>) declare <4 x i64> @llvm.sshl.sat.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.sshl.sat.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.sshl.sat.v16i64(<16 x i64>, <16 x i64>) -declare @llvm.sshl.sat.nvx2i64(, ) -declare @llvm.sshl.sat.nvx4i64(, ) -declare @llvm.sshl.sat.nvx8i64(, ) +declare @llvm.sshl.sat.nxv2i64(, ) +declare @llvm.sshl.sat.nxv4i64(, ) +declare @llvm.sshl.sat.nxv8i64(, ) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll index 196e7376677a5..1762f701a9b2d 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll @@ -5,14 +5,14 @@ define void @reduce_fadd_bfloat() { ; FP-REDUCE-LABEL: 'reduce_fadd_bfloat' -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat 
@llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x 
bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) ; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call fast bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) ; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) ; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) @@ -22,14 +22,14 @@ define void @reduce_fadd_bfloat() { ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'reduce_fadd_bfloat' -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> 
undef) -; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) ; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call fast bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) ; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) ; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) @@ -74,14 +74,14 @@ define void @reduce_fadd_half() { ; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret void ; ; FP-REDUCE-ZVFHMIN-LABEL: 'reduce_fadd_half' -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef) -; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x 
half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef) ; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, undef) ; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, undef) ; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, undef) @@ -222,36 +222,36 @@ define void @reduce_fadd_double() { define void @reduce_ordered_fadd_bfloat() { ; FP-REDUCE-LABEL: 'reduce_ordered_fadd_bfloat' ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated 
cost of 18 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call bfloat 
@llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 255 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 510 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, undef) +; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, undef) ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'reduce_ordered_fadd_bfloat' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, 
<2 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call bfloat 
@llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, undef) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, undef) 
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef) @@ -272,22 +272,39 @@ define void @reduce_ordered_fadd_bfloat() { } define void @reduce_ordered_fadd_half() { -; FP-REDUCE-LABEL: 'reduce_ordered_fadd_half' -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v32 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call half 
@llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; FP-REDUCE-ZVFH-LABEL: 'reduce_ordered_fadd_half' +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v32 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, undef) +; FP-REDUCE-ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; FP-REDUCE-ZVFHMIN-LABEL: 'reduce_ordered_fadd_half' +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v32 = call half 
@llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 255 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 510 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, undef) +; FP-REDUCE-ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'reduce_ordered_fadd_half' ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef) diff --git a/llvm/test/Analysis/MustExecute/irreducible-cfg.ll b/llvm/test/Analysis/MustExecute/irreducible-cfg.ll new file mode 100644 index 0000000000000..a452761ab3356 --- /dev/null +++ b/llvm/test/Analysis/MustExecute/irreducible-cfg.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -disable-output -passes=print-mustexecute %s 2>&1 | FileCheck %s + +; The loop body has 
two predecessors, %header and %side-entry. This leads to irreducible-cfg +define i64 @baz() { +; CHECK-LABEL: define i64 @baz() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: br label %[[BODY:.*]] ; (mustexec in: header) +; CHECK: [[SIDE_ENTRY:.*:]] +; CHECK-NEXT: br label %[[BODY]] +; CHECK: [[BODY]]: +; CHECK-NEXT: [[LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(1) null, align 8 ; (mustexec in: header) +; CHECK-NEXT: br label %[[HEADER]] ; (mustexec in: header) +; +entry: + br label %header + +header: + br label %body + +side-entry: + br label %body + +body: + %load = load ptr addrspace(1), ptr addrspace(1) null, align 8 + br label %header +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/icmp-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/icmp-flags.mir new file mode 100644 index 0000000000000..59e4de9440416 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/icmp-flags.mir @@ -0,0 +1,45 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=none -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: icmp_samesign +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: icmp_samesign + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %cmp:_(s1) = samesign G_ICMP intpred(eq), %y(s32), %y + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %cmp(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %cmp:_(s1) = samesign G_ICMP intpred(eq), %y:_(s32), %y:_ + %zext:_(s32) = G_ZEXT %cmp:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 +... 
+--- +name: icmp_differentsign +body: | + bb.0: + liveins: $w0, $w1 + ; CHECK-LABEL: name: icmp_differentsign + ; CHECK: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %y:_(s32) = COPY $w1 + ; CHECK-NEXT: %cmp:_(s1) = G_ICMP intpred(eq), %y(s32), %y + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %cmp(s1) + ; CHECK-NEXT: $w0 = COPY %zext(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %y:_(s32) = COPY $w1 + %cmp:_(s1) = G_ICMP intpred(eq), %y:_(s32), %y:_ + %zext:_(s32) = G_ZEXT %cmp:_(s1) + $w0 = COPY %zext + RET_ReallyLR implicit $w0 +--- diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslater-samesign.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslater-samesign.ll new file mode 100644 index 0000000000000..0173f92c98220 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslater-samesign.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator < %s | FileCheck %s + + +define <2 x i1> @call_icmp_samesign_vector(<2 x i32> %a, <2 x i32> %b) { + ; CHECK-LABEL: name: call_icmp_samesign_vector + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: %2:_(<2 x s1>) = samesign G_ICMP intpred(ult), [[COPY]](<2 x s32>), [[COPY1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT %2(<2 x s1>) + ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 +entry: + %result = icmp samesign ult <2 x i32> %a, %b + ret <2 x i1> %result +} + +define <2 x i1> @call_icmp_vector(<2 x i32> %a, <2 x i32> %b) { + ; CHECK-LABEL: name: call_icmp_vector + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = 
COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<2 x s1>) = G_ICMP intpred(ult), [[COPY]](<2 x s32>), [[COPY1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[ICMP]](<2 x s1>) + ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 +entry: + %result = icmp ult <2 x i32> %a, %b + ret <2 x i1> %result +} + +define i1 @call_icmp(i32 %a) { + ; CHECK-LABEL: name: call_icmp + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[ICMP]](s1) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ZEXT]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = icmp ult i32 %a, 3 + ret i1 %result +} + +define i1 @call_icmp_samesign(i32 %a) { + ; CHECK-LABEL: name: call_icmp_samesign + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %2:_(s1) = samesign G_ICMP intpred(ult), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %2(s1) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ZEXT]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = icmp samesign ult i32 %a, 3 + ret i1 %result +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll new file mode 100644 index 0000000000000..0193952aa2ab2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-splat-vector.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple aarch64 -mattr=+sve -aarch64-enable-gisel-sve=1 | FileCheck %s --check-prefixes=CHECK,CHECK-SDAG +; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GS + +; REQUIRES: asserts, aarch64-registered-target + +;; add +define @addnxv2i64( %a, %b) { +; CHECK-SDAG-LABEL: addnxv2i64: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: add z0.d, z0.d, #9 // =0x9 +; CHECK-SDAG-NEXT: ret +; +; CHECK-GS-LABEL: addnxv2i64: +; CHECK-GS: // %bb.0: // %entry +; CHECK-GS-NEXT: mov w8, #9 // =0x9 +; CHECK-GS-NEXT: mov z1.d, x8 +; CHECK-GS-NEXT: add z0.d, z0.d, z1.d +; CHECK-GS-NEXT: ret +entry: + %c = add %a, splat (i64 9) + ret %c +} + +define @splarnxv2i64( %a, %b) { +; CHECK-SDAG-LABEL: splarnxv2i64: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: mov z0.d, #9 // =0x9 +; CHECK-SDAG-NEXT: ret +; +; CHECK-GS-LABEL: splarnxv2i64: +; CHECK-GS: // %bb.0: // %entry +; CHECK-GS-NEXT: mov w8, #9 // =0x9 +; CHECK-GS-NEXT: mov z0.d, x8 +; CHECK-GS-NEXT: ret +entry: + ret splat (i64 9) +} + +define @addnxv4i32( %a, %b) { +; CHECK-SDAG-LABEL: addnxv4i32: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: add z0.s, z0.s, #9 // =0x9 +; CHECK-SDAG-NEXT: ret +; +; CHECK-GS-LABEL: addnxv4i32: +; CHECK-GS: // %bb.0: // %entry +; CHECK-GS-NEXT: mov w8, #9 // =0x9 +; CHECK-GS-NEXT: mov z1.s, w8 +; CHECK-GS-NEXT: add z0.s, z0.s, z1.s +; CHECK-GS-NEXT: ret +entry: + %c = add %a, splat (i32 9) + ret %c +} + +define @splatnxv4i32( %a, %b) { +; CHECK-SDAG-LABEL: splatnxv4i32: +; CHECK-SDAG: // %bb.0: // %entry +; CHECK-SDAG-NEXT: mov z0.s, #9 // =0x9 +; CHECK-SDAG-NEXT: ret +; +; CHECK-GS-LABEL: splatnxv4i32: +; CHECK-GS: // %bb.0: // %entry +; CHECK-GS-NEXT: mov w8, #9 // =0x9 +; CHECK-GS-NEXT: mov z0.s, w8 +; CHECK-GS-NEXT: ret +entry: + ret splat (i32 9) +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/avg.ll b/llvm/test/CodeGen/AArch64/avg.ll index cabc0d346b806..ea07b10c22c2e 100644 --- a/llvm/test/CodeGen/AArch64/avg.ll +++ b/llvm/test/CodeGen/AArch64/avg.ll @@ -146,3 +146,205 @@ define <16 x i16> @sext_avgceils_mismatch(<16 x i4> %a0, <16 x i8> %a1) { %avg = sub <16 x i16> %or, %shift ret <16 x i16> %avg } + +define <8 x i16> @add_avgflooru(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgflooru: +; CHECK: // %bb.0: +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %add = add nuw <8 x i16> %a0, %a1 + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgflooru_mismatch(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgflooru_mismatch: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add = add <8 x i16> %a0, %a1 + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu: +; CHECK: // %bb.0: +; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %add0 = add nuw <8 x i16> %a0, splat(i16 1) + %add = add nuw <8 x i16> %a1, %add0 + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu2(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu2: +; CHECK: // %bb.0: +; CHECK-NEXT: urhadd v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret + %add0 = add nuw <8 x i16> %a1, %a0 + %add = add nuw <8 x i16> %add0, splat(i16 1) + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu_mismatch1(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu_mismatch1: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.8h, #1 +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret + %add0 = add <8 x i16> %a1, %a0 + %add = add nuw <8 x i16> 
%add0, splat(i16 1) + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu_mismatch2(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu_mismatch2: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add0 = add nuw <8 x i16> %a1, %a0 + %add = add <8 x i16> %add0, splat(i16 1) + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu_mismatch3(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu_mismatch3: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add0 = add nuw <8 x i16> %a1, %a0 + %add = add <8 x i16> %add0, splat(i16 1) + %avg = lshr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgfloors(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgfloors: +; CHECK: // %bb.0: +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %add = add nsw <8 x i16> %a0, %a1 + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgfloors_mismatch(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgfloors_mismatch: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add = add <8 x i16> %a0, %a1 + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgfoor_mismatch2(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgfoor_mismatch2: +; CHECK: // %bb.0: +; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #2 +; CHECK-NEXT: ret + %add = add nsw <8 x i16> %a0, %a1 + %avg = ashr <8 x i16> %add, splat(i16 2) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils: +; CHECK: // %bb.0: +; CHECK-NEXT: srhadd v0.8h, 
v0.8h, v1.8h +; CHECK-NEXT: ret + %add0 = add nsw <8 x i16> %a0, splat(i16 1) + %add = add nsw <8 x i16> %a1, %add0 + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils2(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils2: +; CHECK: // %bb.0: +; CHECK-NEXT: srhadd v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret + %add0 = add nsw <8 x i16> %a1, %a0 + %add = add nsw <8 x i16> %add0, splat(i16 1) + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils_mismatch1(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils_mismatch1: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.8h, #1 +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: shadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret + %add0 = add <8 x i16> %a1, %a0 + %add = add nsw <8 x i16> %add0, splat(i16 1) + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils_mismatch2(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils_mismatch2: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add0 = add nsw <8 x i16> %a1, %a0 + %add = add <8 x i16> %add0, splat(i16 1) + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils_mismatch3(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils_mismatch3: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #1 +; CHECK-NEXT: ret + %add0 = add nsw <8 x i16> %a1, %a0 + %add = add <8 x i16> %add0, splat(i16 1) + %avg = ashr <8 x i16> %add, splat(i16 1) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceils_mismatch4(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceils_mismatch4: +; CHECK: // %bb.0: +; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #2 +; 
CHECK-NEXT: ret + %add0 = add nsw <8 x i16> %a0, splat(i16 1) + %add = add nsw <8 x i16> %a1, %add0 + %avg = ashr <8 x i16> %add, splat(i16 2) + ret <8 x i16> %avg +} + +define <8 x i16> @add_avgceilu_mismatch(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: add_avgceilu_mismatch: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.8h, #1 +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #2 +; CHECK-NEXT: ret + %add0 = add nuw <8 x i16> %a1, %a0 + %add = add nuw <8 x i16> %add0, splat(i16 1) + %avg = lshr <8 x i16> %add, splat(i16 2) + ret <8 x i16> %avg +} diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt-sve.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt-sve.ll new file mode 100644 index 0000000000000..75c8f8923c381 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt-sve.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -mattr=+sve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -mattr=+sve -aarch64-enable-gisel-sve=1 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define @insert_vscale_2_i64_zero( %vec, i64 %elt) { +; CHECK-SD-LABEL: insert_vscale_2_i64_zero: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ptrue p0.d, vl1 +; CHECK-SD-NEXT: mov z0.d, p0/m, x0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_vscale_2_i64_zero: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, xzr +; CHECK-GI-NEXT: index z1.d, #0, #1 +; CHECK-GI-NEXT: ptrue p0.d +; CHECK-GI-NEXT: mov z2.d, x8 +; CHECK-GI-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d +; CHECK-GI-NEXT: mov z0.d, p0/m, x0 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement %vec, i64 %elt, i64 0 + ret %d +} + +define @insert_vscale_2_i64( %vec, i64 %elt, i64 %idx) { +; CHECK-LABEL: insert_vscale_2_i64: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, x1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z0.d, p0/m, x0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i64 %elt, i64 %idx + ret %d +} + +define @insert_vscale_4_i32_zero( %vec, i32 %elt) { +; CHECK-SD-LABEL: insert_vscale_4_i32_zero: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ptrue p0.s, vl1 +; CHECK-SD-NEXT: mov z0.s, p0/m, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_vscale_4_i32_zero: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, wzr +; CHECK-GI-NEXT: index z1.s, #0, #1 +; CHECK-GI-NEXT: ptrue p0.s +; CHECK-GI-NEXT: mov z2.s, w8 +; CHECK-GI-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s +; CHECK-GI-NEXT: mov z0.s, p0/m, w0 +; CHECK-GI-NEXT: ret +entry: + %d = insertelement %vec, i32 %elt, i64 0 + ret %d +} + +define @insert_vscale_4_i32( %vec, i32 %elt, i64 %idx) { +; CHECK-LABEL: insert_vscale_4_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: mov z2.s, w1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s +; CHECK-NEXT: mov z0.s, p0/m, w0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i32 %elt, i64 %idx + ret %d +} + +define @insert_vscale_8_i16_zero( %vec, i16 %elt) { +; CHECK-LABEL: insert_vscale_8_i16_zero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h, vl1 +; CHECK-NEXT: mov z0.h, p0/m, w0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i16 %elt, i64 0 + ret %d +} + +define @insert_vscale_8_i16( %vec, i16 %elt, i64 %idx) { +; CHECK-LABEL: insert_vscale_8_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: mov z2.h, w1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h +; CHECK-NEXT: mov z0.h, p0/m, w0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i16 %elt, i64 %idx + ret %d +} + +define @insert_vscale_16_i8_zero( %vec, i8 %elt) { +; CHECK-LABEL: insert_vscale_16_i8_zero: +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b, vl1 +; CHECK-NEXT: mov z0.b, p0/m, w0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i8 %elt, i64 0 + ret %d +} + +define @insert_vscale_16_i8( %vec, i8 %elt, i64 %idx) { +; CHECK-LABEL: insert_vscale_16_i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov z2.b, w1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b +; CHECK-NEXT: mov z0.b, p0/m, w0 +; CHECK-NEXT: ret +entry: + %d = insertelement %vec, i8 %elt, i64 %idx + ret %d +} diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 0481d997d24fa..5e5fdd6d31705 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -1099,5 +1099,3 @@ loop: ret: ret i32 %3 } - - diff --git a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll new file mode 100644 index 0000000000000..0f4eec4fdfda1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s + +; Inserting an element from the bottom 128-bits of an SVE type into a NEON vector should use INS (element) to +; avoid pointless FMOV trips. 
+ +; --------- extraction from nxv16i8 + +define <8 x i8> @test_lane0_nxv16i8(<8 x i8> %a, %b) { +; CHECK-LABEL: test_lane0_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x i8> %a, i8 %c, i32 0 + ret <8 x i8> %d +} + +define <8 x i8> @test_lane15_nxv16i8(<8 x i8> %a, %b) { +; CHECK-LABEL: test_lane15_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.b[7], v1.b[15] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 15 + %d = insertelement <8 x i8> %a, i8 %c, i32 7 + ret <8 x i8> %d +} + +define <16 x i8> @test_q_lane0_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <16 x i8> %a, i8 %c, i32 0 + ret <16 x i8> %d +} + +define <16 x i8> @test_q_lane15_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane15_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: ret + %c = extractelement %b, i32 15 + %d = insertelement <16 x i8> %a, i8 %c, i32 15 + ret <16 x i8> %d +} + +; (negative test) Extracted element is not within Vn +define <16 x i8> @test_q_lane16_nxv16i8(<16 x i8> %a, %b) { +; CHECK-LABEL: test_q_lane16_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, z1.b[16] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 16 + %d = insertelement <16 x i8> %a, i8 %c, i32 15 + ret <16 x i8> %d +} + +; --------- extraction from nxv8f16 + +define <4 x half> @test_lane0_nxv8f16(<4 x half> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; 
CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x half> %a, half %c, i32 0 + ret <4 x half> %d +} + +define <4 x half> @test_lane7_nxv8f16(<4 x half> %a, %b) { +; CHECK-LABEL: test_lane7_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <4 x half> %a, half %c, i32 3 + ret <4 x half> %d +} + +define <8 x half> @test_q_lane0_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x half> %a, half %c, i32 0 + ret <8 x half> %d +} + +define <8 x half> @test_q_lane7_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x half> %a, half %c, i32 7 + ret <8 x half> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x half> @test_q_lane8_nxv8f16(<8 x half> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: mov v0.h[7], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x half> %a, half %c, i32 7 + ret <8 x half> %d +} + +; --------- extraction from nxv8bf16 + +define <4 x bfloat> @test_lane0_nxv8bf16(<4 x bfloat> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x bfloat> %a, bfloat %c, i32 0 + ret <4 x bfloat> %d +} + +define <4 x bfloat> @test_lane7_nxv8bf16(<4 x bfloat> %a, %b) { 
+; CHECK-LABEL: test_lane7_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <4 x bfloat> %a, bfloat %c, i32 3 + ret <4 x bfloat> %d +} + +define <8 x bfloat> @test_q_lane0_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 0 + ret <8 x bfloat> %d +} + +define <8 x bfloat> @test_q_lane7_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7 + ret <8 x bfloat> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x bfloat> @test_q_lane8_nxv8bf16(<8 x bfloat> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: mov v0.h[7], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x bfloat> %a, bfloat %c, i32 7 + ret <8 x bfloat> %d +} + +; --------- extraction from nxv8i16 + +define <4 x i16> @test_lane0_nxv8i16(<4 x i16> %a, %b) { +; CHECK-LABEL: test_lane0_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x i16> %a, i16 %c, i32 0 + ret <4 x i16> %d +} + +define <4 x i16> @test_lane7_nxv8i16(<4 x i16> %a, %b) { +; CHECK-LABEL: test_lane7_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c 
= extractelement %b, i32 7 + %d = insertelement <4 x i16> %a, i16 %c, i32 3 + ret <4 x i16> %d +} + +define <8 x i16> @test_q_lane0_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <8 x i16> %a, i16 %c, i32 0 + ret <8 x i16> %d +} + +define <8 x i16> @test_q_lane7_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane7_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.h[7], v1.h[7] +; CHECK-NEXT: ret + %c = extractelement %b, i32 7 + %d = insertelement <8 x i16> %a, i16 %c, i32 7 + ret <8 x i16> %d +} + +; (negative test) Extracted element is not within Vn +define <8 x i16> @test_q_lane8_nxv8i16(<8 x i16> %a, %b) { +; CHECK-LABEL: test_q_lane8_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, z1.h[8] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.h[7], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 8 + %d = insertelement <8 x i16> %a, i16 %c, i32 7 + ret <8 x i16> %d +} + +; --------- extraction from nxv4f32 + +define <2 x float> @test_lane0_nxv4f32(<2 x float> %a, %b) { +; CHECK-LABEL: test_lane0_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x float> %a, float %c, i32 0 + ret <2 x float> %d +} + +define <2 x float> @test_lane3_nxv4f32(<2 x float> %a, %b) { +; CHECK-LABEL: test_lane3_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[3] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <2 x float> %a, float %c, i32 1 + ret <2 x float> %d +} + +define <4 x float> @test_q_lane0_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: 
mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x float> %a, float %c, i32 0 + ret <4 x float> %d +} + +define <4 x float> @test_q_lane3_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane3_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[3], v1.s[3] +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <4 x float> %a, float %c, i32 3 + ret <4 x float> %d +} + +; (negative test) Extracted element is not within Vn +define <4 x float> @test_q_lane4_nxv4f32(<4 x float> %a, %b) { +; CHECK-LABEL: test_q_lane4_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, z1.s[4] +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 4 + %d = insertelement <4 x float> %a, float %c, i32 3 + ret <4 x float> %d +} + +; --------- extraction from nxv4i32 + +define <2 x i32> @test_lane0_nxv4i32(<2 x i32> %a, %b) { +; CHECK-LABEL: test_lane0_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x i32> %a, i32 %c, i32 0 + ret <2 x i32> %d +} + +define <2 x i32> @test_lane3_nxv4i32(<2 x i32> %a, %b) { +; CHECK-LABEL: test_lane3_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v1.s[3] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <2 x i32> %a, i32 %c, i32 1 + ret <2 x i32> %d +} + +define <4 x i32> @test_q_lane0_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <4 x i32> %a, i32 %c, i32 0 + ret <4 x i32> %d +} + +define <4 x i32> @test_q_lane3_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane3_nxv4i32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: mov v0.s[3], v1.s[3] +; CHECK-NEXT: ret + %c = extractelement %b, i32 3 + %d = insertelement <4 x i32> %a, i32 %c, i32 3 + ret <4 x i32> %d +} + +; (negative test) Extracted element is not within Vn +define <4 x i32> @test_q_lane4_nxv4i32(<4 x i32> %a, %b) { +; CHECK-LABEL: test_q_lane4_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, z1.s[4] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 4 + %d = insertelement <4 x i32> %a, i32 %c, i32 3 + ret <4 x i32> %d +} + +; --------- extraction from nxv2f64 + +define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, %b) { +; CHECK-LABEL: test_lane0_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <1 x double> %a, double %c, i32 0 + ret <1 x double> %d +} + +define <1 x double> @test_lane1_nxv2f64(<1 x double> %a, %b) { +; CHECK-LABEL: test_lane1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <1 x double> %a, double %c, i32 0 + ret <1 x double> %d +} + +define <2 x double> @test_q_lane0_nxv2f64(<2 x double> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x double> %a, double %c, i32 0 + ret <2 x double> %d +} + +define <2 x double> @test_q_lane1_nxv2f64(<2 x double> %a, %b) { +; CHECK-LABEL: test_q_lane1_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <2 x double> %a, double %c, i32 1 + ret <2 x double> %d +} + +; (negative test) Extracted element is not within Vn +define <2 x double> @test_q_lane2_nxv2f64(<2 x double> %a, %b) { +; 
CHECK-LABEL: test_q_lane2_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, z1.d[2] +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 2 + %d = insertelement <2 x double> %a, double %c, i32 1 + ret <2 x double> %d +} + +; --------- extraction from nxv2i64 + +define <1 x i64> @test_lane0_nxv2i64(<1 x i64> %a, %b) { +; CHECK-LABEL: test_lane0_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <1 x i64> %a, i64 %c, i32 0 + ret <1 x i64> %d +} + +define <1 x i64> @test_lane1_nxv2i64(<1 x i64> %a, %b) { +; CHECK-LABEL: test_lane1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <1 x i64> %a, i64 %c, i32 0 + ret <1 x i64> %d +} + +define <2 x i64> @test_q_lane0_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane0_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[0], v1.d[0] +; CHECK-NEXT: ret + %c = extractelement %b, i32 0 + %d = insertelement <2 x i64> %a, i64 %c, i32 0 + ret <2 x i64> %d +} + +define <2 x i64> @test_q_lane1_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane1_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: ret + %c = extractelement %b, i32 1 + %d = insertelement <2 x i64> %a, i64 %c, i32 1 + ret <2 x i64> %d +} + +; (negative test) Extracted element is not within Vn +define <2 x i64> @test_q_lane2_nxv2i64(<2 x i64> %a, %b) { +; CHECK-LABEL: test_q_lane2_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.d, z1.d[2] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret + %c = extractelement %b, i32 2 + %d = insertelement <2 x i64> %a, i64 %c, i32 1 + ret <2 x i64> %d +} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll 
b/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll new file mode 100644 index 0000000000000..517a0a86ef14b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll @@ -0,0 +1,127 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=0 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac %s -o - | FileCheck %s --check-prefixes=CHECK,NOTRAP +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=0 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s --check-prefixes=CHECK,TRAP + +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac %s -o - | FileCheck %s --check-prefixes=CHECK,NOTRAP +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s --check-prefixes=CHECK,TRAP + +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=1 -global-isel-abort=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac %s -o - | FileCheck %s --check-prefixes=CHECK,NOTRAP +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=1 -global-isel-abort=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s --check-prefixes=CHECK,TRAP + +;; Note: for FastISel, we fall back to SelectionDAG + +@var = global i32 0 + +define i32 @get_globalvar() { +; CHECK-LABEL: get_globalvar: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; NOTRAP-NEXT: ldr x8, [x17] +; NOTRAP-NEXT: autda x8, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_0 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_0: +; TRAP-NEXT: mov x8, x16 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret + + %val = load i32, ptr @var + ret i32 
%val +} + +define ptr @get_globalvaraddr() { +; CHECK-LABEL: get_globalvaraddr: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; NOTRAP-NEXT: ldr x0, [x17] +; NOTRAP-NEXT: autda x0, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_1 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_1: +; TRAP-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + %val = load i32, ptr @var + ret ptr @var +} + +declare i32 @foo() + +define ptr @resign_globalfunc() { +; CHECK-LABEL: resign_globalfunc: +; CHECK: adrp x17, :got_auth:foo +; CHECK-NEXT: add x17, x17, :got_auth_lo12:foo +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autia x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpaci x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_2 +; TRAP-NEXT: brk #0xc470 +; TRAP-NEXT: .Lauth_success_2: +; CHECK-NEXT: mov x17, #42 +; CHECK-NEXT: pacia x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr @foo, i32 0, i64 42) +} + +define ptr @resign_globalvar() { +; CHECK-LABEL: resign_globalvar: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_3 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_3: +; CHECK-NEXT: mov x17, #43 +; CHECK-NEXT: pacdb x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr @var, i32 3, i64 43) +} + +define ptr @resign_globalvar_offset() { +; CHECK-LABEL: resign_globalvar_offset: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_4 +; TRAP-NEXT: 
brk #0xc472 +; TRAP-NEXT: .Lauth_success_4: +; CHECK-NEXT: add x16, x16, #16 +; CHECK-NEXT: mov x17, #44 +; CHECK-NEXT: pacda x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr getelementptr (i8, ptr @var, i64 16), i32 2, i64 44) +} + +!llvm.module.flags = !{!0} +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll b/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll new file mode 100644 index 0000000000000..23357cd802574 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll @@ -0,0 +1,46 @@ +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=0 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth -mattr=+fpac | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=0 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=1 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth -mattr=+fpac | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=1 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=arm64 -global-isel=1 -global-isel-abort=1 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth -mattr=+fpac | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=arm64 -global-isel=1 -global-isel-abort=1 -relocation-model=pic -o - %s \ +; RUN: -mcpu=cyclone -mattr=+pauth | FileCheck --check-prefixes=CHECK,TRAP %s + +;; Note: for FastISel, we fall back to SelectionDAG + +@var8 = external global i8, align 1 + +define i8 @test_i8(i8 %new) { + %val = load i8, ptr @var8, align 1 + store i8 %new, ptr @var8 + ret i8 %val + +; CHECK-LABEL: test_i8: +; CHECK: adrp x17, :got_auth:var8 +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var8 +; NOTRAP-NEXT: ldr 
x9, [x17] +; NOTRAP-NEXT: autda x9, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_0 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_0: +; TRAP-NEXT: mov x9, x16 +; CHECK-NEXT: ldrb w8, [x9] +; CHECK-NEXT: strb w0, [x9] +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret +} + +!llvm.module.flags = !{!0} +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-elf-got-function-symbols.ll b/llvm/test/CodeGen/AArch64/ptrauth-elf-got-function-symbols.ll new file mode 100644 index 0000000000000..e75acceaa0d12 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-elf-got-function-symbols.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple aarch64-linux-pauthtest -mattr +pauth -filetype=asm %s -o - | \ +; RUN: FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple aarch64-linux-pauthtest -mattr +pauth -filetype=obj %s -o - | \ +; RUN: llvm-readelf -s - | FileCheck %s --check-prefix=OBJ + +; ASM: .type foo,@function +; ASM-LABEL: foo: +; ASM: adrp x17, :got_auth:bar +; ASM-NEXT: add x17, x17, :got_auth_lo12:bar +; ASM-NEXT: ldr x16, [x17] +; ASM-NEXT: autia x16, x17 +; ASM-NEXT: mov x17, x16 +; ASM-NEXT: xpaci x17 +; ASM-NEXT: cmp x16, x17 +; ASM-NEXT: b.eq .Lauth_success_0 +; ASM-NEXT: brk #0xc470 +; ASM-NEXT: .Lauth_success_0: +; ASM-NEXT: paciza x16 +; ASM-NEXT: adrp x8, .Lfptr +; ASM-NEXT: str x16, [x8, :lo12:.Lfptr] +; ASM-NEXT: ret +; ASM: .type .Lfptr,@object +; ASM-NEXT: .local .Lfptr +; ASM-NEXT: .comm .Lfptr,8,8 +; ASM: .type bar,@function + +; OBJ: Symbol table '.symtab' contains [[#]] entries: +; OBJ-NEXT: Num: Value Size Type Bind Vis Ndx Name +; OBJ: 0000000000000000 0 FUNC GLOBAL DEFAULT UND bar + +@fptr = private global ptr null + +define void @foo() { + store ptr ptrauth (ptr @bar, i32 0), ptr @fptr + ret void +} + +declare i32 @bar() + +!llvm.module.flags = !{!0} + +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff 
--git a/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll b/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll new file mode 100644 index 0000000000000..5d0a3f556c4c2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=0 -relocation-model=pic \ +; RUN: -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=0 -relocation-model=pic \ +; RUN: -mattr=+pauth -o - %s | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=1 -relocation-model=pic \ +; RUN: -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=1 -relocation-model=pic \ +; RUN: -mattr=+pauth -o - %s | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=1 -global-isel-abort=1 -relocation-model=pic \ +; RUN: -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=1 -global-isel-abort=1 -relocation-model=pic \ +; RUN: -mattr=+pauth -o - %s | FileCheck --check-prefixes=CHECK,TRAP %s + +;; Note: for FastISel, we fall back to SelectionDAG + +declare extern_weak dso_local i32 @var() + +define ptr @foo() { +; The usual ADRP/ADD pair can't be used for a weak reference because it must +; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC +; otherwise a litpool entry. 
+ ret ptr @var + +; CHECK-LABEL: foo: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; NOTRAP-NEXT: ldr x0, [x17] +; NOTRAP-NEXT: cbz x0, .Lundef_weak0 +; NOTRAP-NEXT: autia x0, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: cbz x16, .Lundef_weak0 +; TRAP-NEXT: autia x16, x17 +; CHECK-NEXT: .Lundef_weak0: +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpaci x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_0 +; TRAP-NEXT: brk #0xc470 +; TRAP-NEXT: .Lauth_success_0: +; TRAP-NEXT: mov x0, x16 +; CHECK-NEXT: ret +} + +@arr_var = extern_weak global [10 x i32] + +define ptr @bar() { + %addr = getelementptr [10 x i32], ptr @arr_var, i32 0, i32 5 + ret ptr %addr + +; CHECK-LABEL: bar: +; CHECK: adrp x17, :got_auth:arr_var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:arr_var +; NOTRAP-NEXT: ldr x8, [x17] +; NOTRAP-NEXT: cbz x8, .Lundef_weak1 +; NOTRAP-NEXT: autda x8, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: cbz x16, .Lundef_weak1 +; TRAP-NEXT: autda x16, x17 +; CHECK-NEXT: .Lundef_weak1: +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_1 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_1: +; TRAP-NEXT: mov x8, x16 +; CHECK-NEXT: add x0, x8, #20 +; CHECK-NEXT: ret +} + +!llvm.module.flags = !{!0} +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll b/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll new file mode 100644 index 0000000000000..3c4747c321856 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=0 \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=0 \ +; RUN: -relocation-model=pic -mattr=+pauth -o - %s | FileCheck 
--check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=1 \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=1 \ +; RUN: -relocation-model=pic -mattr=+pauth -o - %s | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=1 -global-isel-abort=1 \ +; RUN: -relocation-model=pic -mattr=+pauth -mattr=+fpac -o - %s | FileCheck --check-prefixes=CHECK,NOTRAP %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=1 -global-isel-abort=1 \ +; RUN: -relocation-model=pic -mattr=+pauth -o - %s | FileCheck --check-prefixes=CHECK,TRAP %s + +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=0 \ +; RUN: -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=1 \ +; RUN: -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=1 -global-isel-abort=1 \ +; RUN: -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s + +;; Note: for FastISel, we fall back to SelectionDAG + +declare void @consume(i32) +declare void @func() + +define void @aliasee_func() { + ret void +} +@alias_func = alias void (), ptr @aliasee_func + +@aliasee_global = global i32 42 +@alias_global = alias i32, ptr @aliasee_global + +define void @foo() nounwind { +; CHECK-LABEL: foo: +entry: + call void @consume(i32 ptrtoint (ptr @func to i32)) +; CHECK: adrp x17, :got_auth:func +; CHECK-NEXT: add x17, x17, :got_auth_lo12:func +; NOTRAP-NEXT: ldr x[[TMP0:[0-9]+]], [x17] +; NOTRAP-NEXT: autia x[[TMP0]], x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autia x16, x17 +; TRAP-NEXT: mov x17, x16 +; 
TRAP-NEXT: xpaci x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_0 +; TRAP-NEXT: brk #0xc470 +; TRAP-NEXT: .Lauth_success_0: +; TRAP-NEXT: mov x[[TMP0:[0-9]+]], x16 + + call void @consume(i32 ptrtoint (ptr @alias_func to i32)) +; CHECK: adrp x17, :got_auth:alias_func +; CHECK-NEXT: add x17, x17, :got_auth_lo12:alias_func +; NOTRAP-NEXT: ldr x[[TMP1:[0-9]+]], [x17] +; NOTRAP-NEXT: autia x[[TMP1]], x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autia x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpaci x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_1 +; TRAP-NEXT: brk #0xc470 +; TRAP-NEXT: .Lauth_success_1: +; TRAP-NEXT: mov x[[TMP1:[0-9]+]], x16 + + call void @consume(i32 ptrtoint (ptr @alias_global to i32)) +; CHECK: adrp x17, :got_auth:alias_global +; CHECK-NEXT: add x17, x17, :got_auth_lo12:alias_global +; NOTRAP-NEXT: ldr x[[TMP2:[0-9]+]], [x17] +; NOTRAP-NEXT: autda x[[TMP2]], x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_2 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_2: +; TRAP-NEXT: mov x[[TMP2:[0-9]+]], x16 + + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll b/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll new file mode 100644 index 0000000000000..ee34b439daec7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll @@ -0,0 +1,117 @@ +; RUN: llc -global-isel=0 -fast-isel=0 -O0 --relocation-model=pic < %s \ +; RUN: -mattr=+pauth -mattr=+fpac | FileCheck %s --check-prefixes=CHECK,DAGISEL,NOTRAP,DAGISEL-NOTRAP +; RUN: llc -global-isel=0 -fast-isel=0 -O0 --relocation-model=pic < %s \ +; RUN: -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,DAGISEL,TRAP,DAGISEL-TRAP + +; RUN: llc -global-isel=0 -fast-isel=1 -O0 --relocation-model=pic < %s \ +; RUN: 
-mattr=+pauth -mattr=+fpac | FileCheck %s --check-prefixes=CHECK,DAGISEL,NOTRAP,DAGISEL-NOTRAP +; RUN: llc -global-isel=0 -fast-isel=1 -O0 --relocation-model=pic < %s \ +; RUN: -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,DAGISEL,TRAP,DAGISEL-TRAP + +; RUN: llc -global-isel=1 -global-isel-abort=1 -O0 --relocation-model=pic < %s \ +; RUN: -mattr=+pauth -mattr=+fpac | FileCheck %s --check-prefixes=CHECK,GISEL,NOTRAP,GISEL-NOTRAP +; RUN: llc -global-isel=1 -global-isel-abort=1 -O0 --relocation-model=pic < %s \ +; RUN: -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,GISEL,TRAP,GISEL-TRAP + +;; Note: for FastISel, we fall back to SelectionDAG + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android" + +@global = external global i32 +declare void @func() + +define ptr @global_addr() #0 { +; CHECK-LABEL: global_addr: +; CHECK: adrp x17, :got_auth:global +; CHECK-NEXT: add x17, x17, :got_auth_lo12:global +; NOTRAP-NEXT: ldr x0, [x17] +; NOTRAP-NEXT: autda x0, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_0 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_0: +; TRAP-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr @global +} + +define i32 @global_load() #0 { +; CHECK-LABEL: global_load: +; CHECK: adrp x17, :got_auth:global +; CHECK-NEXT: add x17, x17, :got_auth_lo12:global +; NOTRAP-NEXT: ldr x8, [x17] +; NOTRAP-NEXT: autda x8, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_1 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_1: +; TRAP-NEXT: mov x8, x16 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret + + %load = load i32, ptr @global + ret i32 %load +} + +define void @global_store() #0 { +; CHECK-LABEL: global_store: +; CHECK: adrp x17, 
:got_auth:global +; CHECK-NEXT: add x17, x17, :got_auth_lo12:global +; GISEL-NOTRAP-NEXT: ldr x8, [x17] +; GISEL-NOTRAP-NEXT: autda x8, x17 +; GISEL-TRAP-NEXT: ldr x16, [x17] +; GISEL-TRAP-NEXT: autda x16, x17 +; DAGISEL-NOTRAP-NEXT: ldr x9, [x17] +; DAGISEL-NOTRAP-NEXT: autda x9, x17 +; DAGISEL-TRAP-NEXT: ldr x16, [x17] +; DAGISEL-TRAP-NEXT: autda x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpacd x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_2 +; TRAP-NEXT: brk #0xc472 +; TRAP-NEXT: .Lauth_success_2: +; GISEL-TRAP-NEXT: mov x8, x16 +; DAGISEL-TRAP-NEXT: mov x9, x16 +; GISEL-NEXT: str wzr, [x8] +; DAGISEL-NEXT: mov w8, wzr +; DAGISEL-NEXT: str w8, [x9] +; CHECK-NEXT: ret + store i32 0, ptr @global + ret void +} + +define ptr @func_addr() #0 { +; CHECK-LABEL: func_addr: +; CHECK: adrp x17, :got_auth:func +; CHECK-NEXT: add x17, x17, :got_auth_lo12:func +; NOTRAP-NEXT: ldr x0, [x17] +; NOTRAP-NEXT: autia x0, x17 +; TRAP-NEXT: ldr x16, [x17] +; TRAP-NEXT: autia x16, x17 +; TRAP-NEXT: mov x17, x16 +; TRAP-NEXT: xpaci x17 +; TRAP-NEXT: cmp x16, x17 +; TRAP-NEXT: b.eq .Lauth_success_3 +; TRAP-NEXT: brk #0xc470 +; TRAP-NEXT: .Lauth_success_3: +; TRAP-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr @func +} + +attributes #0 = { "target-features"="+tagged-globals" } + +!llvm.module.flags = !{!0} +!0 = !{i32 8, !"ptrauth-elf-got", i32 1} diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index 678afc4dea309..518e3573b5edd 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -290,41 +290,28 @@ define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ret <8 x i1> %mask } +; TODO: Apply better reasoning when lowering extract_subvector from the bottom 128-bits +; of an SVE type. 
define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.b[4] -; CHECK-NEXT: mov v0.b[3], w8 -; CHECK-NEXT: umov w8, v1.b[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.b[6] -; CHECK-NEXT: mov v0.b[5], w8 -; CHECK-NEXT: umov w8, v1.b[7] -; CHECK-NEXT: mov v0.b[6], w9 -; CHECK-NEXT: umov w9, v1.b[8] -; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: umov w8, v1.b[9] -; CHECK-NEXT: mov v0.b[8], w9 -; CHECK-NEXT: umov w9, v1.b[10] -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: umov w8, v1.b[11] -; CHECK-NEXT: mov v0.b[10], w9 -; CHECK-NEXT: umov w9, v1.b[12] -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: umov w8, v1.b[13] -; CHECK-NEXT: mov v0.b[12], w9 -; CHECK-NEXT: umov w9, v1.b[14] -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: umov w8, v1.b[15] -; CHECK-NEXT: mov v0.b[14], w9 -; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov v0.b[2], v1.b[2] +; CHECK-NEXT: mov v0.b[3], v1.b[3] +; CHECK-NEXT: mov v0.b[4], v1.b[4] +; CHECK-NEXT: mov v0.b[5], v1.b[5] +; CHECK-NEXT: mov v0.b[6], v1.b[6] +; CHECK-NEXT: mov v0.b[7], v1.b[7] +; CHECK-NEXT: mov v0.b[8], v1.b[8] +; CHECK-NEXT: mov v0.b[9], v1.b[9] +; CHECK-NEXT: mov v0.b[10], v1.b[10] +; CHECK-NEXT: mov v0.b[11], v1.b[11] +; CHECK-NEXT: mov v0.b[12], v1.b[12] +; CHECK-NEXT: mov v0.b[13], v1.b[13] +; CHECK-NEXT: mov v0.b[14], v1.b[14] +; CHECK-NEXT: mov v0.b[15], v1.b[15] ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1( %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index fb169491b0c90..749a1866e7192 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -30,78 +30,64 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: // %bb.1: // %vector.body ; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #24 // =0x18 ; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: umov w9, v0.b[9] -; CHECK-NEXT: umov w10, v0.b[1] ; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: umov w11, v0.b[15] +; CHECK-NEXT: mov v1.b[1], v0.b[1] ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: umov w8, v0.b[10] -; CHECK-NEXT: mov v1.b[1], w10 -; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: mov v2.b[1], w9 -; CHECK-NEXT: umov w9, v0.b[2] -; CHECK-NEXT: mov v2.b[2], w8 -; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: mov v1.b[2], w9 -; CHECK-NEXT: umov w9, v0.b[12] -; CHECK-NEXT: mov v2.b[3], w10 -; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: mov v1.b[3], w8 -; CHECK-NEXT: umov w8, v0.b[13] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: mov v1.b[4], w10 -; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: mov v2.b[5], w8 -; CHECK-NEXT: umov w8, v0.b[6] -; CHECK-NEXT: mov v1.b[5], w9 -; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: mov v2.b[1], v0.b[9] +; CHECK-NEXT: mov v1.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[2], v0.b[10] +; CHECK-NEXT: mov v1.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[3], v0.b[11] +; CHECK-NEXT: mov v1.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[4], v0.b[12] +; CHECK-NEXT: mov v1.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[5], v0.b[13] +; CHECK-NEXT: mov v1.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[6], v0.b[14] +; CHECK-NEXT: mov v1.b[7], v0.b[7] +; CHECK-NEXT: mov v2.b[7], v0.b[15] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: mov v2.b[6], w10 -; CHECK-NEXT: mov v1.b[6], w8 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: 
uunpklo z0.h, z0.b -; CHECK-NEXT: mov x8, #16 // =0x10 -; CHECK-NEXT: mov x10, #8 // =0x8 -; CHECK-NEXT: mov v2.b[7], w11 -; CHECK-NEXT: mov v1.b[7], w9 +; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: mov x9, #24 // =0x18 -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z1.s, z1.s, #31 -; CHECK-NEXT: and z0.s, z0.s, #0x1 -; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 -; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] -; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 -; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 +; CHECK-NEXT: lsl z3.s, z3.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: and z0.s, z0.s, #0x1 ; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 -; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: mov z2.s, p3/m, #0 // =0x0 +; CHECK-NEXT: asr z3.s, z3.s, #31 +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: and z3.s, z3.s, #0x1 +; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2] 
; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0 -; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z2.s, p2/m, #0 // =0x0 ; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index 6017e13ce0035..ce440d3095d3f 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -1301,3 +1301,43 @@ entry: %result = trunc %s to ret %result } + +define @haddu_v2i64_add( %s0, %s1) { +; SVE-LABEL: haddu_v2i64_add: +; SVE: // %bb.0: // %entry +; SVE-NEXT: eor z2.d, z0.d, z1.d +; SVE-NEXT: and z0.d, z0.d, z1.d +; SVE-NEXT: lsr z1.d, z2.d, #1 +; SVE-NEXT: add z0.d, z0.d, z1.d +; SVE-NEXT: ret +; +; SVE2-LABEL: haddu_v2i64_add: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: uhadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret +entry: + %add = add nuw nsw %s0, %s1 + %avg = lshr %add, splat (i64 1) + ret %avg +} + +define @hadds_v2i64_add( %s0, %s1) { +; SVE-LABEL: hadds_v2i64_add: +; SVE: // %bb.0: // %entry +; SVE-NEXT: eor z2.d, z0.d, z1.d +; SVE-NEXT: and z0.d, z0.d, z1.d +; SVE-NEXT: asr z1.d, z2.d, #1 +; SVE-NEXT: add z0.d, z0.d, z1.d +; SVE-NEXT: ret +; +; SVE2-LABEL: hadds_v2i64_add: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: shadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret +entry: + %add = add nuw nsw %s0, %s1 + %avg = ashr %add, splat (i64 1) + ret %avg +} diff --git 
a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll index 7d81ebaefddb8..fec255b712441 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 ; ; CLASTA (Vectors) @@ -570,13 +571,21 @@ define @dupq_lane_f64( %a, i64 %idx) ; NOTE: Index out of range (0-3) define @dupq_i64_range( %a) { -; CHECK-LABEL: dupq_i64_range: -; CHECK: // %bb.0: -; CHECK-NEXT: index z1.d, #0, #1 -; CHECK-NEXT: and z1.d, z1.d, #0x1 -; CHECK-NEXT: orr z1.d, z1.d, #0x8 -; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: dupq_i64_range: +; SVE: // %bb.0: +; SVE-NEXT: index z1.d, #0, #1 +; SVE-NEXT: and z1.d, z1.d, #0x1 +; SVE-NEXT: orr z1.d, z1.d, #0x8 +; SVE-NEXT: tbl z0.d, { z0.d }, z1.d +; SVE-NEXT: ret +; +; SVE2-LABEL: dupq_i64_range: +; SVE2: // %bb.0: +; SVE2-NEXT: index z1.d, #0, #1 +; SVE2-NEXT: and z1.d, z1.d, #0x1 +; SVE2-NEXT: add z1.d, z1.d, #8 // =0x8 +; SVE2-NEXT: tbl z0.d, { z0.d }, z1.d +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.dupq.lane.nxv2i64( %a, i64 4) ret %out } @@ -1082,10 +1091,17 @@ define @rev_f64( %a) { ; define @splice_i8( %pg, %a, %b) { -; CHECK-LABEL: splice_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: ret +; SVE-LABEL: splice_i8: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.b, p0, z0.b, z1.b +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_i8: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.b, 
p0, { z0.b, z1.b } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv16i8( %pg, %a, %b) @@ -1093,10 +1109,17 @@ define @splice_i8( %pg, } define @splice_i16( %pg, %a, %b) { -; CHECK-LABEL: splice_i16: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: splice_i16: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.h, p0, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_i16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.h, p0, { z0.h, z1.h } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv8i16( %pg, %a, %b) @@ -1104,10 +1127,17 @@ define @splice_i16( %pg, } define @splice_i32( %pg, %a, %b) { -; CHECK-LABEL: splice_i32: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: splice_i32: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_i32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.s, p0, { z0.s, z1.s } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv4i32( %pg, %a, %b) @@ -1115,10 +1145,17 @@ define @splice_i32( %pg, } define @splice_i64( %pg, %a, %b) { -; CHECK-LABEL: splice_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: splice_i64: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.d, p0, z0.d, z1.d +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.d, p0, { z0.d, z1.d } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv2i64( %pg, %a, %b) @@ -1126,10 +1163,17 @@ define @splice_i64( %pg, } define @splice_bf16( %pg, %a, %b) #0 { -; 
CHECK-LABEL: splice_bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: splice_bf16: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.h, p0, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_bf16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.h, p0, { z0.h, z1.h } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv8bf16( %pg, %a, %b) @@ -1137,10 +1181,17 @@ define @splice_bf16( %pg, @splice_f16( %pg, %a, %b) { -; CHECK-LABEL: splice_f16: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: splice_f16: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.h, p0, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_f16: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.h, p0, { z0.h, z1.h } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv8f16( %pg, %a, %b) @@ -1148,10 +1199,17 @@ define @splice_f16( %pg, @splice_f32( %pg, %a, %b) { -; CHECK-LABEL: splice_f32: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: splice_f32: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE-NEXT: ret +; +; SVE2-LABEL: splice_f32: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.s, p0, { z0.s, z1.s } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv4f32( %pg, %a, %b) @@ -1159,10 +1217,17 @@ define @splice_f32( %pg, @splice_f64( %pg, %a, %b) { -; CHECK-LABEL: splice_f64: -; CHECK: // %bb.0: -; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: splice_f64: +; SVE: // %bb.0: +; SVE-NEXT: splice z0.d, p0, z0.d, z1.d +; SVE-NEXT: ret 
+; +; SVE2-LABEL: splice_f64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; SVE2-NEXT: splice z0.d, p0, { z0.d, z1.d } +; SVE2-NEXT: ret %out = call @llvm.aarch64.sve.splice.nxv2f64( %pg, %a, %b) diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll new file mode 100644 index 0000000000000..e18ac46165d2e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "(bl|ptrue)" --version 5 +; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF +; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL + +define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; SLEEF-LABEL: test_sincos_v4f32: +; SLEEF: bl _ZGVnN4vl4l4_sincosf +; +; ARMPL-LABEL: test_sincos_v4f32: +; ARMPL: bl armpl_vsincosq_f32 + %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x) + %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0 + %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1 + store <4 x float> %result.0, ptr %out_sin, align 4 + store <4 x float> %result.1, ptr %out_cos, align 4 + ret void +} + +define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; SLEEF-LABEL: test_sincos_v2f64: +; SLEEF: bl _ZGVnN2vl8l8_sincos +; +; ARMPL-LABEL: test_sincos_v2f64: +; ARMPL: bl armpl_vsincosq_f64 + %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x) + %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0 + %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1 + store <2 x double> %result.0, ptr %out_sin, 
align 8 + store <2 x double> %result.1, ptr %out_cos, align 8 + ret void +} + +define void @test_sincos_nxv4f32( %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; SLEEF-LABEL: test_sincos_nxv4f32: +; SLEEF: bl _ZGVsNxvl4l4_sincosf +; +; ARMPL-LABEL: test_sincos_nxv4f32: +; ARMPL: ptrue p0.s +; ARMPL: bl armpl_svsincos_f32_x + %result = call { , } @llvm.sincos.nxv4f32( %x) + %result.0 = extractvalue { , } %result, 0 + %result.1 = extractvalue { , } %result, 1 + store %result.0, ptr %out_sin, align 4 + store %result.1, ptr %out_cos, align 4 + ret void +} + +define void @test_sincos_nxv2f64( %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; SLEEF-LABEL: test_sincos_nxv2f64: +; SLEEF: bl _ZGVsNxvl8l8_sincos +; +; ARMPL-LABEL: test_sincos_nxv2f64: +; ARMPL: ptrue p0.d +; ARMPL: bl armpl_svsincos_f64_x + %result = call { , } @llvm.sincos.nxv2f64( %x) + %result.0 = extractvalue { , } %result, 0 + %result.1 = extractvalue { , } %result, 1 + store %result.0, ptr %out_sin, align 8 + store %result.1, ptr %out_cos, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll new file mode 100644 index 0000000000000..83912b1e77db2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll @@ -0,0 +1,271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr %ptr monotonic, align 1 + ret i8 %load +} + +define i32 @atomic_load_flat_monotonic_i8_zext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i8_zext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr %ptr monotonic, align 1 + %ext = zext i8 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_flat_monotonic_i8_sext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i8_sext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_sbyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr %ptr monotonic, align 1 + %ext = sext i8 %load to i32 + ret i32 %ext +} + +define i16 @atomic_load_flat_monotonic_i8_zext_to_i16(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i8_zext_to_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr %ptr monotonic, align 1 + %ext = zext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_flat_monotonic_i8_sext_to_i16(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i8_sext_to_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_sbyte v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr %ptr monotonic, align 1 + %ext = sext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_flat_monotonic_i16(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort 
v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + ret i16 %load +} + +define i32 @atomic_load_flat_monotonic_i16_zext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i16_zext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_flat_monotonic_i16_sext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i16_sext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_sshort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + %ext = sext i16 %load to i32 + ret i32 %ext +} + +define half @atomic_load_flat_monotonic_f16(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr %ptr monotonic, align 2 + ret half %load +} + +define bfloat @atomic_load_flat_monotonic_bf16(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic bfloat, ptr %ptr monotonic, align 2 + ret bfloat %load +} + +define i32 @atomic_load_flat_monotonic_f16_zext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_f16_zext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, 
v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr %ptr monotonic, align 2 + %cast = bitcast half %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_flat_monotonic_bf16_zext_to_i32(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic bfloat, ptr %ptr monotonic, align 2 + %cast = bitcast bfloat %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_flat_monotonic_i16_d16_hi_shift(ptr %ptr) { +; GCN-LABEL: atomic_load_flat_monotonic_i16_d16_hi_shift: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_ushort v0, v[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %shl = shl i32 %ext, 16 + ret i32 %shl +} + +define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr, <2 x i16> %vec) { +; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 + ret <2 x i16> %insert +} + +define i32 @atomic_load_flat_monotonic_i16_d16_lo_or(ptr %ptr, i16 %high) { +; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load 
atomic i16, ptr %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %high.ext = zext i16 %high to i32 + %shl = shl i32 %high.ext, 16 + %or = or i32 %shl, %ext + ret i32 %or +} + +define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr, <2 x i16> %vec) { +; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 + ret <2 x i16> %insert +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll new file mode 100644 index 0000000000000..e2906c3d4fdb2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll @@ -0,0 +1,635 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +define i8 @atomic_load_global_monotonic_i8(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1 + ret i8 %load +} + +define i32 @atomic_load_global_monotonic_i8_zext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i8_zext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 
0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i8_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i8_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i8_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1 + %ext = zext i8 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_global_monotonic_i8_sext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i8_sext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_sbyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i8_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_sbyte v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i8_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_sbyte v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: atomic_load_global_monotonic_i8_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1 + %ext = sext i8 %load to i32 + ret i32 %ext +} + +define i16 @atomic_load_global_monotonic_i8_zext_to_i16(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i8_zext_to_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i8_zext_to_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i8_zext_to_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i8_zext_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1 + %ext = zext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_global_monotonic_i8_sext_to_i16(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i8_sext_to_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 
+; GFX6-NEXT: buffer_load_sbyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i8_sext_to_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_sbyte v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i8_sext_to_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_sbyte v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i8_sext_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1 + %ext = sext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_global_monotonic_i16(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + ret i16 %load +} + +define i32 @atomic_load_global_monotonic_i16_zext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i16_zext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_global_monotonic_i16_sext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i16_sext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_sbyte v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_sshort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_sshort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_sshort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %ext = sext i16 %load to i32 + ret i32 %ext +} + +define half @atomic_load_global_monotonic_f16(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr addrspace(1) %ptr monotonic, align 2 + ret half %load +} + +define bfloat @atomic_load_global_monotonic_bf16(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_bf16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic bfloat, ptr addrspace(1) %ptr monotonic, align 2 + ret bfloat %load +} + +define i32 @atomic_load_global_monotonic_f16_zext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_f16_zext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_f16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_f16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_f16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr addrspace(1) %ptr monotonic, align 2 + %cast = bitcast half %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_global_monotonic_bf16_zext_to_i32(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_bf16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic 
bfloat, ptr addrspace(1) %ptr monotonic, align 2 + %cast = bitcast bfloat %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_global_monotonic_i16_d16_hi_shift(ptr addrspace(1) %ptr) { +; GFX6-LABEL: atomic_load_global_monotonic_i16_d16_hi_shift: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_hi_shift: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_hi_shift: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_shift: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %shl = shl i32 %ext, 16 + ret i32 %shl +} + +define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addrspace(1) %ptr, <2 x i16> %vec) { +; GFX6-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; 
GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 + ret <2 x i16> %insert +} + +define i32 
@atomic_load_global_monotonic_i16_d16_lo_or(ptr addrspace(1) %ptr, i16 %high) { +; GFX6-LABEL: atomic_load_global_monotonic_i16_d16_lo_or: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_lo_or: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_lo_or: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_or: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %high.ext = zext i16 %high to i32 + %shl = shl i32 %high.ext, 16 + %or = or i32 %shl, %ext + ret i32 %or +} + +define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addrspace(1) %ptr, <2 x i16> %vec) { +; GFX6-LABEL: 
atomic_load_global_monotonic_i16_d16_lo_vector_insert: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 + ret <2 x i16> %insert +} +;; NOTE: These prefixes 
are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll new file mode 100644 index 0000000000000..1656814d6fb06 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll @@ -0,0 +1,491 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; TODO: Merge with atomic_load_local.ll + +define i8 @atomic_load_local_monotonic_i8(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u8 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 + ret i8 %load +} + +define i32 @atomic_load_local_monotonic_i8_zext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i8_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: 
ds_read_u8 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i8_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u8 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i8_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 + %ext = zext i8 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_local_monotonic_i8_sext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i8_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_i8 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i8_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_i8 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i8_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_i8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 + %ext = sext i8 %load to i32 + ret i32 %ext +} + +define i16 @atomic_load_local_monotonic_i8_zext_to_i16(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i8_zext_to_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i8_zext_to_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u8 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i8_zext_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 + %ext = zext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_local_monotonic_i8_sext_to_i16(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i8_sext_to_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_i8 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i8_sext_to_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_i8 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i8_sext_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_i8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 + %ext = sext i8 %load to i16 + ret i16 %ext +} + +define i16 @atomic_load_local_monotonic_i16(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16: +; GFX8: ; %bb.0: 
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + ret i16 %load +} + +define i32 @atomic_load_local_monotonic_i16_zext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + ret i32 %ext +} + +define i32 @atomic_load_local_monotonic_i16_sext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_sext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_i16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_sext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; 
GFX8-NEXT: ds_read_i16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16_sext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_i16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %ext = sext i16 %load to i32 + ret i32 %ext +} + +define half @atomic_load_local_monotonic_f16(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2 + ret half %load +} + +define bfloat @atomic_load_local_monotonic_bf16(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_bf16: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2 + ret bfloat %load +} + +define i32 @atomic_load_local_monotonic_f16_zext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_f16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_f16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_f16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2 + %cast = bitcast half %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_local_monotonic_bf16_zext_to_i32(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_bf16_zext_to_i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_bf16_zext_to_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_bf16_zext_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2 + %cast = bitcast bfloat %load to i16 + %ext = zext i16 %cast to i32 + ret i32 %ext +} + +define i32 @atomic_load_local_monotonic_i16_d16_hi_shift(ptr addrspace(3) %ptr) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_d16_hi_shift: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_hi_shift: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_shift: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %shl = shl i32 %ext, 16 + ret i32 %shl +} + +define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrspace(3) %ptr, <2 x i16> %vec) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 1 + ret <2 x i16> %insert +} + +define i32 @atomic_load_local_monotonic_i16_d16_lo_or(ptr addrspace(3) %ptr, i16 %high) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_d16_lo_or: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_lo_or: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
atomic_load_local_monotonic_i16_d16_lo_or: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %ext = zext i16 %load to i32 + %high.ext = zext i16 %high to i32 + %shl = shl i32 %high.ext, 16 + %or = or i32 %shl, %ext + ret i32 %or +} + +define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrspace(3) %ptr, <2 x i16> %vec) { +; GFX7-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_u16 v0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 + %insert = insertelement <2 x i16> %vec, i16 %load, i32 0 + ret <2 x i16> 
%insert +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 43266554c2d8a..d38a9051175be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 9be4fec5a3b95..3678eb5ac7682 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = 
atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index e28a1efb75404..ea44612465579 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, 
align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1899,7 +1899,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1978,7 +1978,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -2106,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -2205,7 +2205,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 
syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 - %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i32 %idx.0, ptr addrspace(1) %add_use, align 4 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -3321,5 +3321,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index d63044d7cec6d..4023e053c66c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2754,7 +2754,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2856,7 +2856,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -3031,7 +3031,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 
syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3189,7 +3189,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3317,7 +3317,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -3416,7 +3416,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3524,5 +3524,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } 
+ +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index b27d8fdc24ff7..935200d595307 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -298,7 +298,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: ; implicit-def: $sgpr6 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 @@ -312,7 +312,6 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_cbranch_execz .LBB4_6 ; GFX10-NEXT: .LBB4_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 ; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_4 @@ -329,12 +328,11 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 ; GFX10-NEXT: s_mov_b32 s7, -1 -; GFX10-NEXT: ; implicit-def: $vgpr5 ; GFX10-NEXT: s_and_saveexec_b32 s8, s4 ; GFX10-NEXT: s_cbranch_execz .LBB4_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 ; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo ; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 ; GFX10-NEXT: s_or_b32 s7, s4, s7 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index c1cb74cb0e25a..c9ab351f94016 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -38,8 +38,9 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 92ce2af47e22a..605c8f7e36919 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1371,7 +1371,7 @@ define 
amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1458,7 +1458,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1483,7 +1483,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1512,7 +1512,7 @@ 
define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1713,3 +1713,4 @@ attributes #1 = { nounwind } attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll index 3438cbdd476d8..4b0ff1b2eb470 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-cs-chain.ll @@ -24,9 +24,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) - ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) 
+ ; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) @@ -50,9 +53,12 @@ define amdgpu_cs_chain void @chain_call(<3 x i32> inreg %sgpr, { i32, ptr addrsp ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32) + ; GFX10-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) @@ -82,9 +88,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX11-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) - ; GFX11-NEXT: $sgpr0 = COPY [[UV]](s32) - ; GFX11-NEXT: $sgpr1 = COPY [[UV1]](s32) - ; GFX11-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX11-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; GFX11-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX11-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; GFX11-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX11-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32) + ; GFX11-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) ; GFX11-NEXT: $vgpr8 = COPY [[COPY3]](s32) ; GFX11-NEXT: $vgpr9 = COPY [[COPY4]](p5) ; GFX11-NEXT: $vgpr10 = COPY [[COPY5]](s32) @@ -108,9 +117,12 @@ define amdgpu_cs_chain void @chain_preserve_call(<3 x i32> inreg %sgpr, { i32, p ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[GV1:%[0-9]+]]:ccr_sgpr_64(p0) = G_GLOBAL_VALUE @callee_preserve ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: $sgpr0 = COPY [[UV]](s32) - ; GFX10-NEXT: $sgpr1 = COPY [[UV1]](s32) - ; GFX10-NEXT: $sgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; GFX10-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32) + ; GFX10-NEXT: $sgpr2 = COPY 
[[INTRINSIC_CONVERGENT2]](s32) ; GFX10-NEXT: $vgpr8 = COPY [[COPY3]](s32) ; GFX10-NEXT: $vgpr9 = COPY [[COPY4]](p5) ; GFX10-NEXT: $vgpr10 = COPY [[COPY5]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll index 5effd24a75208..adad38de380d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -50,7 +50,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 @@ -99,8 +100,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: 
[[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index c3694158e7b97..96c3575e3190c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -942,7 +942,8 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 @@ -3984,8 +3985,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; 
CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 @@ -5309,7 +5312,8 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5354,7 +5358,8 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; 
CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5402,8 +5407,10 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5451,8 +5458,10 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY 
[[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5499,7 +5508,8 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5546,7 +5556,8 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5591,7 +5602,8 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](s32) + ; CHECK-NEXT: 
[[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5639,8 +5651,10 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5685,7 +5699,9 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = 
COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5738,8 +5754,12 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16) ; CHECK-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s16>) - ; CHECK-NEXT: $sgpr0 = COPY [[UV7]](<2 x s16>) - ; CHECK-NEXT: $sgpr1 = COPY [[UV8]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV8]](<2 x s16>) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5787,8 +5807,12 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](<2 x s16>) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT 
intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[BITCAST1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5836,8 +5860,10 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p0) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5885,8 +5911,10 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](p1) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: 
[[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) @@ -5931,7 +5959,9 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: $sgpr0 = COPY [[COPY9]](p3) + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY9]](p3) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[PTRTOINT]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -5983,10 +6013,14 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p1>) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) - ; CHECK-NEXT: $sgpr2 = COPY [[UV2]](s32) - ; CHECK-NEXT: $sgpr3 = COPY [[UV3]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT 
intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV2]](s32) + ; CHECK-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV3]](s32) + ; CHECK-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32) ; CHECK-NEXT: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY13]](p4) @@ -6034,8 +6068,10 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]] ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p5>) - ; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32) ; CHECK-NEXT: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY11]](p4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir index afa81980ebd62..23b80528c80a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-sextload-from-sextinreg.mir @@ -133,7 +133,6 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (volatile load (s8), addrspace 1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s8), addrspace 1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (volatile load (s8), align 1, addrspace 1) @@ -172,7 +171,6 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (volatile load (s16), addrspace 1) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (volatile load (s16), addrspace 1) ; CHECK-NEXT: $vgpr0 = COPY [[SEXTLOAD]](s32) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (volatile load (s16), align 2, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 49b450a9af0bc..4d26453e1a0d6 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -576,11 +576,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v8, s8 ; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v8, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 +; GFX908-NEXT: 
v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] @@ -641,10 +641,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 ; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v27 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v26 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_mov_b64 s[20:21], -1 @@ -654,10 +654,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard @@ -743,8 +739,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], 
v[6:7], v[6:7] op_sel:[0,1] @@ -800,8 +796,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 @@ -810,10 +806,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13 -; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index 06f66e05d6747..8ca3e8255b634 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -501,6 +501,79 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { unreachable } +; Chain call with SGPR arguments that we cannot prove are uniform. 
+define amdgpu_cs void @cs_to_chain_nonuniform(<3 x i32> %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: cs_to_chain_nonuniform: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v10, v5 +; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: cs_to_chain_nonuniform: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_getpc_b64 s[100:101] +; GISEL-GFX10-NEXT: s_mov_b32 s100, s0 +; GISEL-GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[100:103], s[100:101], 0x10 +; GISEL-GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v10, v5 +; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 +; GISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 +; GISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; GISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; GISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: cs_to_chain_nonuniform: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; DAGISEL-GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; DAGISEL-GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v10, v5 +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX11-NEXT: 
s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: cs_to_chain_nonuniform: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_getpc_b64 s[100:101] +; DAGISEL-GFX10-NEXT: s_mov_b32 s100, s0 +; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; DAGISEL-GFX10-NEXT: s_load_dwordx4 s[100:103], s[100:101], 0x10 +; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v9, v4 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v10, v5 +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_bitset0_b32 s103, 21 +; DAGISEL-GFX10-NEXT: s_add_u32 s100, s100, s0 +; DAGISEL-GFX10-NEXT: s_addc_u32 s101, s101, 0 +; DAGISEL-GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; DAGISEL-GFX10-NEXT: s_mov_b64 s[48:49], s[100:101] +; DAGISEL-GFX10-NEXT: s_mov_b64 s[50:51], s[102:103] +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) 
@llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %b, i32 0) + unreachable +} + define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_to_chain: ; GISEL-GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index f54ea615ca664..c57a35aa1880d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -17,7 +17,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: v_bfe_u32 v2, v2, 0, v3 +; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -36,7 +38,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_bfe_u32 v2, v3, 0, v4 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -215,7 +219,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: v_bfe_i32 v2, v2, 0, v3 +; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -234,7 +240,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, 
ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_bfe_i32 v2, v3, 0, v4 +; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3 +; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 4143c65a840d7..662de47413654 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -1052,16 +1052,15 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6 -; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6 -; GFX9-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8 +; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1085,10 +1084,9 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2 -; 
GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2 +; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4 +; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2 ; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5] @@ -1125,17 +1123,16 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v8, v5 +; GFX1030-NEXT: v_mov_b32_e32 v7, v5 ; GFX1030-NEXT: v_mov_b32_e32 v5, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 ; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7 +; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 ; GFX1030-NEXT: v_mov_b32_e32 v4, v5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4 +; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 ; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v3, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] @@ -1167,16 +1164,15 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX9-NEXT: s_mov_b32 s6, 0x80000001 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, 3, v[2:3] -; GFX9-NEXT: v_mul_i32_i24_e32 v8, 3, v6 -; GFX9-NEXT: v_lshl_add_u32 v9, v6, 31, v6 -; GFX9-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-NEXT: v_lshl_add_u32 v8, v6, 31, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v5 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, 3, 0 -; 
GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_add3_u32 v7, v7, v9, v8 +; GFX9-NEXT: v_add3_u32 v7, v7, v8, v6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, -1, v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v1 @@ -1195,10 +1191,9 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX942-NEXT: v_mul_i32_i24_e32 v4, 3, v2 -; GFX942-NEXT: v_lshl_add_u32 v5, v2, 31, v2 +; GFX942-NEXT: v_lshl_add_u32 v4, v2, 31, v2 ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_add3_u32 v3, v3, v5, v4 +; GFX942-NEXT: v_add3_u32 v3, v3, v4, v2 ; GFX942-NEXT: v_mul_hi_u32 v4, v0, 3 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, 3, v[4:5] @@ -1227,17 +1222,16 @@ define noundef i64 @sdiv64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_mul_hi_u32 v2, v0, 3 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GFX1030-NEXT: v_mul_i32_i24_e32 v7, 3, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v1, 3, v[2:3] -; GFX1030-NEXT: v_mov_b32_e32 v8, v5 +; GFX1030-NEXT: v_mov_b32_e32 v7, v5 ; GFX1030-NEXT: v_mov_b32_e32 v5, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v6, 3, 0 ; GFX1030-NEXT: v_lshl_add_u32 v6, v6, 31, v6 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0x80000001, v0, v[4:5] -; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v7 +; GFX1030-NEXT: v_add3_u32 v3, v3, v6, v2 ; GFX1030-NEXT: v_mov_b32_e32 v4, v5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v0, -1, v[2:3] -; GFX1030-NEXT: v_add_co_u32 v4, s4, v8, v4 +; GFX1030-NEXT: v_add_co_u32 v4, s4, v7, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v5, null, 0, 0, s4 ; GFX1030-NEXT: v_sub_nc_u32_e32 v6, 
v3, v1 ; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x80000001, v1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll index 9677ec41ce268..3d9616f02d52d 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll @@ -150,11 +150,21 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind { ; ---------------------------------------------------------------------------- ; define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind { -; GCN-LABEL: bzhi32_d0: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: bzhi32_d0: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bzhi32_d0: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] %numhighbits = sub i32 32, %numlowbits %highbitscleared = shl i32 %val, %numhighbits %masked = lshr i32 %highbitscleared, %numhighbits diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index 32cb1056022de..d64becc74ddc2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic @@ -73,7 +71,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -123,7 +121,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 61cac642d19e8..ff48a3fc98018 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5669,9 +5669,17 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB30_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5683,11 +5691,27 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: .LBB30_4: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB30_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5696,18 +5720,54 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB30_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB30_4 +; GFX940-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB30_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB30_2 +; GFX940-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5719,11 +5779,25 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: .LBB30_4: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB30_6: ; %atomicrmw.phi 
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5731,9 +5805,16 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB30_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -5745,10 +5826,28 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB30_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: .LBB30_4: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB30_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 
0x1 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB30_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 @@ -5757,17 +5856,54 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_4 +; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB30_2 +; GFX90A-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword 
v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB30_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -5777,24 +5913,50 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB30_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: .LBB30_4: ; %Flow3 +; GFX908-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB30_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB30_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -5804,24 +5966,51 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr 
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB30_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: .LBB30_4: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB30_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB30_4 +; GFX7-NEXT: ; %bb.1: ; 
%atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -5831,13 +6020,31 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB30_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: .LBB30_4: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB30_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -5851,63 +6058,151 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB31_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB31_6 +; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], 
v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB31_2 +; GFX12-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB31_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB31_4 +; GFX940-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB31_2 +; GFX940-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB31_6 +; 
GFX11-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5915,9 +6210,22 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB31_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB31_6 +; GFX10-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -5929,56 +6237,145 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB31_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; 
GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB31_2 +; GFX10-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_4 +; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB31_2 +; 
GFX90A-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB31_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB31_6 +; GFX908-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] 
offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB31_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB31_2 +; GFX908-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_6 +; GFX8-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -5988,24 +6385,55 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB31_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB31_2 +; GFX8-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 
+; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_6 +; GFX7-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -6015,11 +6443,27 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: 
buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB31_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB31_2 +; GFX7-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6034,70 +6478,151 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB32_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB32_6 +; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB32_2 +; GFX12-NEXT: .LBB32_6: ; %atomicrmw.private +; 
GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB32_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB32_4 +; GFX940-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], 
s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB32_2 +; GFX940-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB32_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB32_6 +; GFX11-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: 
.LBB32_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6106,9 +6631,22 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB32_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB32_6 +; GFX10-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -6120,21 +6658,65 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB32_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB32_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB32_2 +; GFX10-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen 
offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_4 +; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB32_2 +; GFX90A-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6142,38 +6724,79 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB32_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB32_6 +; GFX908-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB32_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB32_2 +; GFX908-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, 
v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_6 +; GFX8-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -6183,24 +6806,55 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB32_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB32_2 +; GFX8-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt 
vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_6 +; GFX7-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -6210,11 +6864,27 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: 
s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB32_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB32_2 +; GFX7-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6229,9 +6899,25 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB33_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB33_6 +; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; 
GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -6242,11 +6928,24 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB33_2 +; GFX12-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6254,18 +6953,56 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB33_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB33_4 +; GFX940-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB33_2 +; GFX940-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB33_6 +; GFX11-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX11-NEXT: 
flat_load_b64 v[6:7], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6276,20 +7013,44 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB33_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; 
GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB33_6 +; GFX10-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6301,27 +7062,82 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB33_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB33_2 +; GFX10-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_4 +; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB33_2 +; GFX90A-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB33_3 +; GFX908-NEXT: ; %bb.1: 
; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB33_6 +; GFX908-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6330,23 +7146,52 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB33_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB33_2 +; GFX908-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: 
s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_6 +; GFX8-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6355,23 +7200,53 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB33_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB33_2 +; GFX8-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_6 +; GFX7-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6380,12 +7255,28 @@ define void 
@flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB33_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB33_2 +; GFX7-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -6399,24 +7290,56 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, 
exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB34_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB34_6 +; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB34_2 +; GFX12-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6424,33 +7347,88 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB34_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB34_4 +; GFX940-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB34_2 +; GFX940-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], 
v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_6 +; GFX11-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6459,9 +7437,21 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB34_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB34_6 +; GFX10-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6473,95 +7463,229 @@ define void 
@flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB34_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB34_2 +; GFX10-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_4 +; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB34_2 +; GFX90A-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB34_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB34_6 +; GFX908-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB34_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB34_2 +; GFX908-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; 
%bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB34_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB34_6 +; GFX8-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB34_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB34_2 +; GFX8-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB34_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB34_6 +; GFX7-NEXT: 
.LBB34_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB34_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB34_2 +; GFX7-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, 
s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6576,24 +7700,56 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB35_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB35_6 +; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB35_2 +; GFX12-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6601,25 +7757,62 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB35_3 +; GFX940-NEXT: ; 
%bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB35_4 +; GFX940-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB35_2 +; GFX940-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_6 +; 
GFX11-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6630,11 +7823,23 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6643,9 +7848,21 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; 
GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB35_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB35_6 +; GFX10-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6657,10 +7874,26 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB35_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB35_2 +; GFX10-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: 
buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6669,89 +7902,201 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_4 +; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB35_2 +; GFX90A-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB35_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB35_6 +; GFX908-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GFX908-NEXT: s_cbranch_execnz .LBB35_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB35_2 +; GFX908-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB35_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB35_6 +; GFX8-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; GFX8-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB35_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB35_2 +; GFX8-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB35_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB35_6 +; GFX7-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: 
s_cbranch_execnz .LBB35_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB35_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB35_2 +; GFX7-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ad5498723940d..36aa73fbf8e92 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2770,148 +2770,382 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], 
v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB18_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_6 +; GFX12-NEXT: ; %bb.5: ; 
%atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: 
flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB18_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; 
GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; 
GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) 
+; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, 
v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB18_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: 
s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB18_2 +; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2925,127 +3159,329 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 
0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_6 +; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB19_2 +; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; 
GFX940-NEXT: s_cbranch_execnz .LBB19_4 +; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB19_2 +; GFX940-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; 
%Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_6 +; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: 
v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_4 +; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB19_2 +; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], 
v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 +; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB19_2 +; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: 
buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_6 +; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB19_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB19_2 +; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: 
s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_6 +; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3056,21 +3492,71 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB19_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB19_2 +; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_4 +; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB19_2 +; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3085,141 +3571,330 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_6 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; 
GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB20_2 +; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], 
v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_4 +; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB20_2 +; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_6 +; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; 
GFX11-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_4 +; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB20_2 +; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; 
GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 +; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB20_2 +; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; 
GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v4, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v6, vcc, -1, v4, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_6 +; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[2:3] -; GFX908-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB20_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB20_2 +; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_6 +; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3230,21 +3905,71 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB20_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; 
GFX8-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_4 +; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB20_2 +; 
GFX7-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3259,10 +3984,26 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_6 +; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3275,11 +4016,26 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB21_2 +; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3287,19 +4043,59 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; 
GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB21_2 +; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_6 +; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3312,40 +4108,126 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; 
GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB21_2 +; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_6 +; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3355,24 +4237,54 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB21_2 +; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_6 +; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3382,20 +4294,69 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
+; GFX8-NEXT: s_cbranch_execz .LBB21_2 +; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB21_2 +; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: 
buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -3409,27 +4370,61 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_6 +; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], 
v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB22_2 +; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3437,36 +4432,95 
@@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_4 +; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB22_2 +; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_6 +; GFX11-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,83 +4529,238 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_4 +; GFX10-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB22_2 +; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] offset:2040 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_4 +; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB22_2 +; GFX90A-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_6 +; GFX908-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; 
GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB22_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB22_2 +; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_6 +; GFX8-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB22_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_4 
+; GFX7-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB22_2 +; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3566,27 +4775,61 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_6 +; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; 
GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB23_2 +; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3594,43 +4837,96 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_4 +; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB23_2 +; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: flat_load_b64 v[4:5], v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_6 +; GFX11-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], 
v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3639,12 +4935,43 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_4 +; GFX10-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB23_2 +; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3652,77 +4979,194 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_4 +; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB23_2 +; GFX90A-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v7, vcc, -1, v1, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_6 +; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; 
GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB23_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB23_2 +; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; 
GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_6 +; GFX8-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB23_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB23_2 +; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_4 +; GFX7-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB23_2 +; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 
v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3737,204 +5181,416 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: 
v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB24_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_4 +; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB24_2 +; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], 
v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB24_4: ; %Flow2 +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB24_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; 
%atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 
v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB24_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 
v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB24_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; 
GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB24_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB24_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: .LBB24_4: ; %Flow2 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: 
buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ -3948,148 +5604,382 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB25_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_4 +; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB25_2 +; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB25_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: 
v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_4 +; GFX10-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB25_2 +; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: 
v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_4 +; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB25_2 +; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: 
buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB25_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; 
%atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB25_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB25_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_6 +; GFX8-NEXT: 
; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_4 +; GFX7-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX7-NEXT: s_cbranch_execz .LBB25_2 +; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index dbf2626ec4d4f..d96d3db9f005d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2770,148 +2770,382 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: 
.LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB18_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB18_6: ; 
%atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; 
GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB18_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; 
%atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: 
s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB18_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; 
GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB18_2 +; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2925,127 +3159,329 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_6 +; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; 
GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB19_2 +; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_4 +; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB19_2 +; GFX940-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_6 +; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; 
GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_4 +; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB19_2 +; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: 
buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 +; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB19_2 +; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_6 +; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; 
GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB19_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB19_2 +; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_6 +; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3056,21 +3492,71 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB19_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB19_2 +; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 
+; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_4 +; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB19_2 +; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3085,141 +3571,330 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_6 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: 
v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB20_2 +; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; 
GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_4 +; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB20_2 +; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_6 +; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[2:3] +; 
GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: 
s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_4 +; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB20_2 +; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, 
vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 +; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB20_2 +; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, 
0xfffff800, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v4, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v6, vcc, -1, v4, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_6 +; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB20_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB20_2 +; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_3 +; GFX8-NEXT: ; %bb.1: ; 
%Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_6 +; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3230,21 +3905,71 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB20_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 
offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_4 +; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB20_2 +; GFX7-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; 
GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3259,10 +3984,26 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_6 +; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3275,11 +4016,26 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: 
s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB21_2 +; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3287,19 +4043,59 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB21_2 +; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_6 +; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3312,40 +4108,126 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: 
buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB21_2 +; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_6 +; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3355,24 +4237,54 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; 
GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB21_2 +; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_6 +; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3382,20 +4294,69 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB21_2 +; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 
v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB21_2 +; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, double %val 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -3409,27 +4370,61 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_6 +; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB22_2 +; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3437,36 +4432,95 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_4 +; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB22_2 +; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_6 +; GFX11-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,83 +4529,238 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_4 +; GFX10-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB22_2 +; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; 
GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_4 +; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB22_2 +; GFX90A-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 
v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_6 +; GFX908-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB22_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB22_2 +; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_6 +; GFX8-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: 
s_cbranch_execnz .LBB22_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_4 +; GFX7-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB22_2 +; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3566,27 +4775,61 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_6 +; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB23_2 +; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3594,43 +4837,96 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_4 +; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB23_2 +; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, 
vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: flat_load_b64 v[4:5], v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_6 +; GFX11-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3639,12 +4935,43 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, 
vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_4 +; GFX10-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB23_2 +; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3652,77 +4979,194 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_4 +; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB23_2 +; GFX90A-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; 
GFX908-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v7, vcc, -1, v1, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_6 +; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: 
s_cbranch_execnz .LBB23_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB23_2 +; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_6 +; GFX8-NEXT: .LBB23_2: ; 
%atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB23_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB23_2 +; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: 
v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_4 +; GFX7-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB23_2 +; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, 
s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3737,204 +5181,416 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV 
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB24_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 
+; GFX940-NEXT: s_cbranch_execnz .LBB24_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_4 +; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB24_2 +; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; 
GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB24_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 
v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 -; GFX10-NEXT: 
; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB24_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz 
.LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; 
GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX908-NEXT: s_cbranch_execnz .LBB24_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB24_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: 
$vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB24_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB24_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; 
GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], 
v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB24_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: .LBB24_4: ; %Flow2 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ 
-3948,148 +5604,382 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB25_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_4 +; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB25_2 +; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB25_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; 
GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_4 +; GFX10-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB25_2 +; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_4 +; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB25_2 +; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB25_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: 
s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 
v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB25_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB25_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 
+; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_4 +; GFX7-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB25_2 +; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; 
GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9cc4f3987b320..14f75814128f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -3234,9 +3234,17 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB16_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3248,11 +3256,27 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_2 +; GFX12-NEXT: ; 
%bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: .LBB16_4: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB16_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[0:1], v[4:5], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3261,9 +3285,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB16_4 +; GFX940-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -3273,21 +3304,44 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB16_2 +; GFX940-NEXT: ; %bb.3: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: .LBB16_4: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB16_6 +; GFX940-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[0:1], off sc0 sc1 +; GFX940-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3299,11 +3353,25 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: .LBB16_4: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3311,9 +3379,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB16_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; 
GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -3325,10 +3400,28 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: .LBB16_4: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB16_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB16_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 @@ -3337,9 +3430,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -3348,21 +3448,45 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB16_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 
+; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB16_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3372,24 +3496,50 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: .LBB16_4: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB16_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; 
GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3399,24 +3549,51 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 -; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: .LBB16_4: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB16_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB16_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: 
.LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -3426,13 +3603,31 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: .LBB16_4: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB16_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst ret double %result @@ -3446,78 +3641,164 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 
; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_6 +; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: 
s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB17_2 +; GFX12-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_6 +; GFX940-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: 
.LBB17_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB17_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB17_2 +; GFX940-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; 
GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_6 +; GFX11-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; 
GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3525,9 +3806,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_6 +; GFX10-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -3539,71 +3833,158 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB17_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB17_2 +; GFX10-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_6 +; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB17_2 +; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: 
buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_6 +; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX908-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB17_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB17_2 +; GFX908-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_6 +; GFX8-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3613,24 +3994,55 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB17_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB17_2 +; GFX8-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], 
-v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_6 +; GFX7-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -3640,11 +4052,27 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB17_2 +; GFX7-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -3659,44 +4087,84 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 
s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_6 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_2 +; GFX12-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] +; GFX12-NEXT: 
scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 ; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_6 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] @@ -3706,41 +4174,77 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, 
exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_6 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], -v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: 
.LBB18_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3749,9 +4253,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_6 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -3763,10 +4280,26 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 
exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3775,11 +4308,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_6 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: 
.LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -3788,11 +4332,26 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3800,11 +4359,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 
; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3814,24 +4384,54 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_2 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX908-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB18_6 +; GFX8-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3841,24 +4441,55 @@ define double 
@flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_2 +; GFX8-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; 
GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_6 +; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -3868,11 +4499,27 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB18_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB18_2 +; GFX7-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -3887,9 +4534,25 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_6 +; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] @@ -3900,11 +4563,24 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB19_2 +; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3912,9 +4588,21 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_6 +; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -3923,20 +4611,46 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB19_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB19_2 +; GFX940-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_6 +; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -3947,20 +4661,44 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_6 +; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX10-NEXT: 
flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -3972,19 +4710,47 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB19_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB19_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB19_2 +; GFX10-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_6 +; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -3992,20 +4758,47 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB19_2 +; GFX90A-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, 
s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_6 +; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4014,23 +4807,52 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB19_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX908-NEXT: s_cbranch_execz .LBB19_2 +; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_6 +; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4039,23 +4861,53 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB19_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB19_2 +; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_6 +; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 
-; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4064,12 +4916,28 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB19_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB19_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB19_2 +; GFX7-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst ret void @@ -4083,24 +4951,56 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; 
GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_6 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB20_2 +; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4108,46 +5008,101 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_6 +; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB20_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB20_2 +; GFX940-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_6 +; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NEXT: 
scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4156,9 +5111,21 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_6 +; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4170,108 +5137,242 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB20_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB20_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz 
.LBB20_2 +; GFX10-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_6 +; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB20_2 +; GFX90A-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; 
GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_6 +; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB20_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB20_2 +; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt 
vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_6 +; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 -; 
GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB20_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_6 +; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB20_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB20_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB20_2 +; GFX7-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; 
GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -4286,24 +5387,56 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_6 +; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB21_2 +; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -4311,15 +5444,24 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[4:5] ; GFX940-NEXT: s_mov_b32 s1, -1 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, 
s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_6 +; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4328,24 +5470,49 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB21_2 +; GFX940-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_6 +; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4356,11 +5523,23 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4369,9 +5548,21 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_6 +; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] @@ -4383,116 +5574,242 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB21_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_6 +; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: 
.LBB21_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB21_2 +; GFX90A-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_6 +; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: 
s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB21_2 +; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_6 +; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB21_3: ; 
%atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB21_2 +; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_6 +; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB21_2 +; GFX7-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index eded1ee04625b..839f4a18508e5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -6,43 +6,136 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB0_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB0_4 +; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB0_2 +; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB0_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB0_4 +; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB0_2 +; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, 
s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB0_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB0_4 +; GFX12-NEXT: .LBB0_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB0_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB0_2 +; GFX12-NEXT: .LBB0_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -53,39 +146,104 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; 
GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB1_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB1_3 +; GCN1-NEXT: s_branch .LBB1_4 +; GCN1-NEXT: .LBB1_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB1_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: 
buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB1_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB1_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB1_3 +; GCN2-NEXT: s_branch .LBB1_4 +; GCN2-NEXT: .LBB1_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB1_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 
v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB1_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -94,12 +252,34 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB1_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB1_3 +; GFX12-NEXT: s_branch .LBB1_4 +; GFX12-NEXT: .LBB1_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; 
GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -113,40 +293,109 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB2_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB2_4 +; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: 
v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB2_2 +; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: 
s_cbranch_vccnz .LBB2_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB2_4 +; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB2_2 +; GCN2-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_addr64_offset: @@ -156,13 +405,37 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: 
flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB2_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB2_4 +; GFX12-NEXT: .LBB2_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB2_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB2_2 +; GFX12-NEXT: .LBB2_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -174,43 +447,108 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, 
s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB3_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB3_3 +; GCN1-NEXT: s_branch .LBB3_4 +; GCN1-NEXT: .LBB3_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB3_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB3_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 
s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB3_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB3_3 +; GCN2-NEXT: s_branch .LBB3_4 +; GCN2-NEXT: .LBB3_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB3_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: 
v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB3_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -219,13 +557,35 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB3_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB3_3 +; GFX12-NEXT: s_branch .LBB3_4 +; GFX12-NEXT: .LBB3_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB3_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 
+; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB3_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -240,39 +600,130 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB4_4 +; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB4_2 +; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; 
GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB4_4 +; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB4_2 +; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 
s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB4_4 +; GFX12-NEXT: .LBB4_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB4_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB4_2 +; GFX12-NEXT: .LBB4_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add 
ptr %out, i64 %in syncscope("agent") seq_cst @@ -282,16 +733,49 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB5_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB5_3 +; GCN1-NEXT: s_branch .LBB5_4 +; GCN1-NEXT: .LBB5_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB5_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, 
s[12:15], 0 offen +; GCN1-NEXT: .LBB5_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -299,16 +783,48 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB5_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB5_3 +; GCN2-NEXT: s_branch .LBB5_4 +; GCN2-NEXT: .LBB5_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB5_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword 
v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB5_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -319,12 +835,32 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB5_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB5_3 +; GFX12-NEXT: s_branch .LBB5_4 +; GFX12-NEXT: .LBB5_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB5_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -337,36 +873,105 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64: ; GCN1: ; %bb.0: ; %entry 
-; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB6_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB6_4 +; GCN1-NEXT: .LBB6_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB6_2 +; GCN1-NEXT: .LBB6_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; 
GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB6_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB6_4 +; GCN2-NEXT: .LBB6_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB6_2 +; GCN2-NEXT: .LBB6_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, 
s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_addr64: @@ -374,15 +979,38 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB6_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB6_4 +; GFX12-NEXT: .LBB6_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB6_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB6_2 +; GFX12-NEXT: .LBB6_4: ; %atomicrmw.private +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -393,54 +1021,140 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB7_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: 
v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB7_3 +; GCN1-NEXT: s_branch .LBB7_4 +; GCN1-NEXT: .LBB7_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB7_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB7_3 +; GCN2-NEXT: s_branch .LBB7_4 +; GCN2-NEXT: .LBB7_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB7_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB7_3 +; GFX12-NEXT: s_branch .LBB7_4 +; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB7_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -454,43 +1168,134 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: 
s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB8_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB8_4 +; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB8_2 +; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: 
v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB8_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB8_4 +; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB8_2 +; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; 
GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB8_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB8_4 +; GFX12-NEXT: .LBB8_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB8_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB8_2 +; GFX12-NEXT: .LBB8_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -501,39 +1306,102 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, 
s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB9_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB9_3 +; GCN1-NEXT: s_branch .LBB9_4 +; GCN1-NEXT: .LBB9_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB9_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB9_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; 
GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB9_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB9_3 +; GCN2-NEXT: s_branch .LBB9_4 +; GCN2-NEXT: .LBB9_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB9_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: 
buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB9_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -542,12 +1410,34 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB9_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB9_3 +; GFX12-NEXT: s_branch .LBB9_4 +; GFX12-NEXT: .LBB9_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB9_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -561,40 +1451,107 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB10_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB10_4 +; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB10_2 +; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 
s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB10_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB10_4 +; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: 
flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB10_2 +; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64_offset: @@ -604,13 +1561,37 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB10_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB10_4 +; GFX12-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB10_2 +; GFX12-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -622,43 +1603,106 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz 
.LBB11_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB11_3 +; GCN1-NEXT: s_branch .LBB11_4 +; GCN1-NEXT: .LBB11_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB11_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB11_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: 
s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB11_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB11_3 +; GCN2-NEXT: s_branch .LBB11_4 +; GCN2-NEXT: .LBB11_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB11_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB11_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -667,13 +1711,35 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; 
GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB11_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB11_3 +; GFX12-NEXT: s_branch .LBB11_4 +; GFX12-NEXT: .LBB11_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB11_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB11_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -688,39 +1754,128 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB12_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB12_4 +; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB12_2 +; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB12_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB12_4 +; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB12_2 +; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB12_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB12_4 +; GFX12-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB12_2 +; GFX12-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst @@ -730,16 +1885,48 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; 
GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB13_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB13_3 +; GCN1-NEXT: s_branch .LBB13_4 +; GCN1-NEXT: .LBB13_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB13_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB13_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -747,16 +1934,47 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, 
s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB13_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB13_3 +; GCN2-NEXT: s_branch .LBB13_4 +; GCN2-NEXT: .LBB13_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB13_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB13_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -767,12 +1985,32 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB13_2 +; GFX12-NEXT: ; %bb.1: ; 
%atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB13_3 +; GFX12-NEXT: s_branch .LBB13_4 +; GFX12-NEXT: .LBB13_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB13_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB13_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -785,36 +2023,103 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB14_4 +; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB14_2 +; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 
s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB14_4 +; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB14_2 +; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64: @@ -822,15 +2127,38 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB14_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB14_4 +; GFX12-NEXT: .LBB14_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB14_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB14_2 +; GFX12-NEXT: .LBB14_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -841,54 +2169,138 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB15_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB15_3 +; GCN1-NEXT: s_branch .LBB15_4 +; GCN1-NEXT: .LBB15_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB15_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB15_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; 
GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB15_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB15_3 +; GCN2-NEXT: s_branch .LBB15_4 +; GCN2-NEXT: .LBB15_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB15_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, 
s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB15_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB15_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB15_3 +; GFX12-NEXT: s_branch .LBB15_4 +; GFX12-NEXT: .LBB15_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; 
GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB15_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -902,43 +2314,136 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB16_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB16_4 +; GCN1-NEXT: .LBB16_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB16_2 +; GCN1-NEXT: .LBB16_4: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB16_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB16_4 +; GCN2-NEXT: .LBB16_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global +; GCN2-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB16_2 +; GCN2-NEXT: .LBB16_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB16_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB16_4 +; GFX12-NEXT: .LBB16_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB16_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; 
GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB16_2 +; GFX12-NEXT: .LBB16_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -949,39 +2454,104 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB17_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB17_3 +; GCN1-NEXT: s_branch .LBB17_4 +; GCN1-NEXT: .LBB17_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; 
GCN2-NEXT: s_cbranch_vccz .LBB17_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB17_3 +; GCN2-NEXT: s_branch .LBB17_4 +; GCN2-NEXT: .LBB17_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -990,12 +2560,34 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB17_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB17_3 +; GFX12-NEXT: s_branch .LBB17_4 +; GFX12-NEXT: .LBB17_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1009,40 +2601,109 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB18_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB18_4 +; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB18_2 +; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, 
SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB18_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB18_4 +; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB18_2 +; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, 
vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64_offset: @@ -1052,13 +2713,37 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB18_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB18_4 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, 
v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1070,43 +2755,108 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB19_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB19_3 +; GCN1-NEXT: s_branch .LBB19_4 +; GCN1-NEXT: .LBB19_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB19_3: ; %atomicrmw.private +; 
GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB19_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB19_2 
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB19_3 +; GCN2-NEXT: s_branch .LBB19_4 +; GCN2-NEXT: .LBB19_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB19_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB19_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -1115,13 +2865,35 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB19_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB19_3 +; GFX12-NEXT: s_branch .LBB19_4 +; GFX12-NEXT: .LBB19_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1136,39 +2908,130 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, 
s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB20_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB20_4 +; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB20_2 +; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, 
s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB20_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB20_4 +; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB20_2 +; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: 
s_cbranch_vccnz .LBB20_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB20_4 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB20_2 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst @@ -1178,16 +3041,49 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB21_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 
-; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB21_3 +; GCN1-NEXT: s_branch .LBB21_4 +; GCN1-NEXT: .LBB21_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB21_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB21_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1195,16 +3091,48 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB21_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; 
GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB21_3 +; GCN2-NEXT: s_branch .LBB21_4 +; GCN2-NEXT: .LBB21_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB21_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB21_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1215,12 +3143,32 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB21_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB21_3 +; GFX12-NEXT: s_branch .LBB21_4 +; GFX12-NEXT: .LBB21_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1233,36 +3181,105 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB22_3 +; 
GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB22_4 +; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB22_2 +; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 
v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB22_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB22_4 +; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB22_2 +; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64: @@ -1270,15 +3287,38 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB22_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB22_4 +; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB22_2 +; GFX12-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1289,54 +3329,140 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: 
s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB23_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB23_3 +; GCN1-NEXT: s_branch .LBB23_4 +; GCN1-NEXT: .LBB23_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB23_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB23_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 
v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB23_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB23_3 +; GCN2-NEXT: s_branch .LBB23_4 +; GCN2-NEXT: .LBB23_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB23_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB23_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB23_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB23_3 +; GFX12-NEXT: s_branch .LBB23_4 +; GFX12-NEXT: .LBB23_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, 
s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1350,41 +3476,137 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB24_4 +; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB24_2 +; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN1-NEXT: 
v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB24_4 +; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB24_2 +; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB24_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB24_4 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE 
+; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1395,38 +3617,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB25_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], 
v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB25_3 +; GCN1-NEXT: s_branch .LBB25_4 +; GCN1-NEXT: .LBB25_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB25_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB25_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; 
GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB25_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB25_3 +; GCN2-NEXT: s_branch .LBB25_4 +; GCN2-NEXT: .LBB25_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB25_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB25_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1436,12 +3725,35 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 
:: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB25_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB25_3 +; GFX12-NEXT: s_branch .LBB25_4 +; GFX12-NEXT: .LBB25_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB25_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1455,38 +3767,109 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], 
s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB26_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB26_4 +; GCN1-NEXT: .LBB26_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB26_2 +; GCN1-NEXT: .LBB26_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 
offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB26_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB26_4 +; GCN2-NEXT: .LBB26_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB26_2 +; GCN2-NEXT: .LBB26_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, 
s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64_offset: @@ -1496,13 +3879,38 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB26_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB26_4 +; GFX12-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB26_2 +; GFX12-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1514,42 +3922,109 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB27_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB27_3 +; GCN1-NEXT: s_branch .LBB27_4 +; GCN1-NEXT: .LBB27_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: 
s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB27_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB27_3 +; GCN2-NEXT: s_branch .LBB27_4 +; GCN2-NEXT: .LBB27_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1559,13 +4034,36 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: 
v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB27_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB27_3 +; GFX12-NEXT: s_branch .LBB27_4 +; GFX12-NEXT: .LBB27_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB27_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB27_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1580,37 +4078,131 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB28_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB28_4 +; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB28_2 +; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: 
atomic_max_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB28_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB28_4 +; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB28_2 +; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword 
v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB28_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB28_4 +; GFX12-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB28_2 +; GFX12-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -1620,15 +4212,49 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB29_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB29_3 +; GCN1-NEXT: s_branch .LBB29_4 +; GCN1-NEXT: .LBB29_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB29_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB29_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -1637,15 +4263,48 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: 
s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB29_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB29_3 +; GCN2-NEXT: s_branch .LBB29_4 +; GCN2-NEXT: .LBB29_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB29_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB29_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -1657,12 +4316,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; 
GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB29_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB29_3 +; GFX12-NEXT: s_branch .LBB29_4 +; GFX12-NEXT: .LBB29_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB29_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB29_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1675,34 +4355,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; 
GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB30_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB30_4 +; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm -; +; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB30_2 +; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_endpgm +; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; 
GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB30_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB30_4 +; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB30_2 +; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: 
buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64: @@ -1710,15 +4461,39 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB30_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB30_4 +; GFX12-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB30_2 +; GFX12-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: 
v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1729,38 +4504,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB31_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB31_3 +; GCN1-NEXT: s_branch .LBB31_4 +; GCN1-NEXT: .LBB31_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB31_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: 
s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB31_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB31_3 +; GCN2-NEXT: s_branch .LBB31_4 +; GCN2-NEXT: .LBB31_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB31_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1768,15 +4610,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB31_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB31_3 +; GFX12-NEXT: s_branch .LBB31_4 +; GFX12-NEXT: .LBB31_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB31_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB31_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1790,41 +4654,137 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; 
GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB32_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB32_4 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB32_2 +; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB32_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB32_4 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB32_2 +; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: 
; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB32_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB32_4 +; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB32_2 +; GFX12-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1835,38 +4795,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; 
GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB33_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB33_3 +; GCN1-NEXT: s_branch .LBB33_4 +; GCN1-NEXT: .LBB33_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB33_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB33_4: ; 
%atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB33_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB33_3 +; GCN2-NEXT: s_branch .LBB33_4 +; GCN2-NEXT: .LBB33_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB33_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; 
GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1876,12 +4903,35 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB33_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB33_3 +; GFX12-NEXT: s_branch .LBB33_4 +; GFX12-NEXT: .LBB33_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB33_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1895,38 +4945,109 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 
+; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB34_2 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 
0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB34_2 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64_offset: @@ -1936,13 +5057,38 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB34_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB34_4 +; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB34_2 +; GFX12-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1954,42 +5100,109 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB35_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB35_3 +; GCN1-NEXT: s_branch .LBB35_4 +; GCN1-NEXT: .LBB35_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB35_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: 
v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB35_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB35_3 +; GCN2-NEXT: s_branch .LBB35_4 +; GCN2-NEXT: .LBB35_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB35_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; 
GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1999,13 +5212,36 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB35_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB35_3 +; GFX12-NEXT: s_branch 
.LBB35_4 +; GFX12-NEXT: .LBB35_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB35_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB35_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2020,37 +5256,131 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB36_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB36_4 +; GCN1-NEXT: .LBB36_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; 
GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB36_2 +; GCN1-NEXT: .LBB36_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB36_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB36_4 +; GCN2-NEXT: .LBB36_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global +; GCN2-NEXT: 
v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB36_2 +; GCN2-NEXT: .LBB36_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB36_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB36_4 +; GFX12-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB36_2 +; GFX12-NEXT: .LBB36_4: ; %atomicrmw.private 
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -2060,15 +5390,49 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB37_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB37_3 +; GCN1-NEXT: s_branch .LBB37_4 +; GCN1-NEXT: .LBB37_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; 
GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -2077,15 +5441,48 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB37_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB37_3 +; GCN2-NEXT: s_branch .LBB37_4 +; GCN2-NEXT: .LBB37_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, 
s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -2097,12 +5494,33 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB37_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB37_3 +; GFX12-NEXT: s_branch .LBB37_4 +; GFX12-NEXT: .LBB37_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB37_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 
vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB37_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2115,34 +5533,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB38_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB38_4 +; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB38_2 +; GCN1-NEXT: .LBB38_4: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB38_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB38_4 +; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB38_3: ; 
%atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB38_2 +; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64: @@ -2150,15 +5639,39 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB38_3 +; GFX12-NEXT: ; %bb.1: 
; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB38_4 +; GFX12-NEXT: .LBB38_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB38_2 +; GFX12-NEXT: .LBB38_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2169,38 +5682,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 
s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB39_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB39_3 +; GCN1-NEXT: s_branch .LBB39_4 +; GCN1-NEXT: .LBB39_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB39_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB39_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 
0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB39_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB39_3 +; GCN2-NEXT: s_branch .LBB39_4 +; GCN2-NEXT: .LBB39_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB39_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; 
GCN2-NEXT: .LBB39_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2208,15 +5788,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB39_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB39_3 +; GFX12-NEXT: s_branch .LBB39_4 +; GFX12-NEXT: .LBB39_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB39_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, 
vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB39_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2230,41 +5832,137 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB40_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB40_4 +; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB40_2 +; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: 
flat_atomic_smin_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB40_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB40_4 +; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: 
s_cbranch_execnz .LBB40_2 +; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB40_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB40_4 +; GFX12-NEXT: .LBB40_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB40_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB40_2 +; GFX12-NEXT: 
.LBB40_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2275,38 +5973,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB41_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB41_3 +; GCN1-NEXT: s_branch .LBB41_4 +; GCN1-NEXT: 
.LBB41_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB41_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB41_2 +; 
GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB41_3 +; GCN2-NEXT: s_branch .LBB41_4 +; GCN2-NEXT: .LBB41_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB41_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2316,12 +6081,35 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; 
GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB41_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB41_3 +; GFX12-NEXT: s_branch .LBB41_4 +; GFX12-NEXT: .LBB41_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB41_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2335,38 +6123,109 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: 
s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB42_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB42_4 +; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB42_2 +; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB42_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB42_4 +; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB42_2 +; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: 
v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64_offset: @@ -2376,13 +6235,38 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB42_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB42_4 +; GFX12-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB42_2 +; GFX12-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, 
s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2394,42 +6278,109 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB43_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB43_3 +; GCN1-NEXT: s_branch .LBB43_4 +; GCN1-NEXT: .LBB43_2: +; GCN1-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB43_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] 
glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB43_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB43_3 +; GCN2-NEXT: s_branch .LBB43_4 +; GCN2-NEXT: .LBB43_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB43_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2439,13 +6390,36 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] 
+; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB43_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB43_3 +; GFX12-NEXT: s_branch .LBB43_4 +; GFX12-NEXT: .LBB43_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB43_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB43_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2460,37 +6434,131 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; 
GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB44_4 +; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB44_2 +; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; 
GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB44_4 +; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB44_2 +; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], 
src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB44_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB44_4 +; GFX12-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB44_2 +; GFX12-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -2500,15 +6568,49 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: 
s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB45_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB45_3 +; GCN1-NEXT: s_branch .LBB45_4 +; GCN1-NEXT: .LBB45_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB45_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -2517,15 +6619,48 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 
0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB45_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB45_3 +; GCN2-NEXT: s_branch .LBB45_4 +; GCN2-NEXT: .LBB45_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB45_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -2537,12 +6672,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB45_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB45_3 +; GFX12-NEXT: s_branch .LBB45_4 +; GFX12-NEXT: .LBB45_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB45_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB45_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2555,34 +6711,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, 
s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB46_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB46_4 +; GCN1-NEXT: .LBB46_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB46_2 +; GCN1-NEXT: .LBB46_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword 
s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB46_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB46_4 +; GCN2-NEXT: .LBB46_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB46_2 +; GCN2-NEXT: .LBB46_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64: @@ -2590,15 +6817,39 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 
%in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB46_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB46_4 +; GFX12-NEXT: .LBB46_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB46_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB46_2 +; GFX12-NEXT: .LBB46_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2609,38 +6860,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr 
%out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB47_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB47_3 +; GCN1-NEXT: s_branch .LBB47_4 +; GCN1-NEXT: .LBB47_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword 
v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB47_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 
+; GCN2-NEXT: s_cbranch_execz .LBB47_3 +; GCN2-NEXT: s_branch .LBB47_4 +; GCN2-NEXT: .LBB47_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2648,15 +6966,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB47_2 +; GFX12-NEXT: ; %bb.1: ; 
%atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB47_3 +; GFX12-NEXT: s_branch .LBB47_4 +; GFX12-NEXT: .LBB47_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB47_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB47_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2670,41 +7010,137 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB48_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] 
+; GCN1-NEXT: s_cbranch_vccz .LBB48_4 +; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB48_2 +; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: 
s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB48_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB48_4 +; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB48_2 +; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB48_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB48_4 +; GFX12-NEXT: .LBB48_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB48_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB48_2 +; GFX12-NEXT: .LBB48_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2715,38 +7151,105 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: 
s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB49_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB49_3 +; GCN1-NEXT: s_branch .LBB49_4 +; GCN1-NEXT: .LBB49_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB49_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB49_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB49_3 +; GCN2-NEXT: s_branch .LBB49_4 +; GCN2-NEXT: .LBB49_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB49_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: 
buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2756,12 +7259,35 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB49_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB49_3 +; GFX12-NEXT: s_branch .LBB49_4 +; GFX12-NEXT: .LBB49_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: 
v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB49_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2775,38 +7301,109 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB50_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB50_4 +; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB50_2 +; GCN1-NEXT: .LBB50_4: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB50_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB50_4 +; GCN2-NEXT: 
.LBB50_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB50_2 +; GCN2-NEXT: .LBB50_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64_offset: @@ -2816,13 +7413,38 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, 
-1 +; GFX12-NEXT: s_cbranch_vccnz .LBB50_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB50_4 +; GFX12-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB50_2 +; GFX12-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2834,42 +7456,109 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, 
s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB51_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB51_3 +; GCN1-NEXT: s_branch .LBB51_4 +; GCN1-NEXT: .LBB51_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB51_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB51_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB51_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB51_3 +; GCN2-NEXT: s_branch .LBB51_4 +; GCN2-NEXT: .LBB51_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB51_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], 
v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB51_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2879,13 +7568,36 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB51_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB51_3 +; GFX12-NEXT: s_branch .LBB51_4 +; GFX12-NEXT: .LBB51_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB51_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: 
scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB51_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2900,37 +7612,131 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB52_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB52_4 +; GCN1-NEXT: .LBB52_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB52_2 +; GCN1-NEXT: .LBB52_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 
+; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB52_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB52_4 +; GCN2-NEXT: .LBB52_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB52_2 +; 
GCN2-NEXT: .LBB52_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB52_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB52_4 +; GFX12-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB52_2 +; GFX12-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, 
s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -2940,15 +7746,49 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB53_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB53_3 +; GCN1-NEXT: s_branch .LBB53_4 +; GCN1-NEXT: .LBB53_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB53_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], 
v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB53_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -2957,15 +7797,48 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umin_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB53_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB53_3 +; GCN2-NEXT: s_branch .LBB53_4 +; GCN2-NEXT: .LBB53_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB53_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 
+; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB53_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -2977,12 +7850,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB53_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB53_3 +; GFX12-NEXT: s_branch .LBB53_4 +; GFX12-NEXT: .LBB53_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB53_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB53_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2995,34 +7889,105 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB54_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB54_4 +; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB54_2 +; GCN1-NEXT: .LBB54_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; 
GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB54_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB54_4 +; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: 
s_cbranch_execnz .LBB54_2 +; GCN2-NEXT: .LBB54_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64: @@ -3030,15 +7995,39 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB54_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB54_4 +; GFX12-NEXT: .LBB54_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB54_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB54_2 +; GFX12-NEXT: .LBB54_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3049,38 +8038,105 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB55_2 
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB55_3 +; GCN1-NEXT: s_branch .LBB55_4 +; GCN1-NEXT: .LBB55_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB55_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB55_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; 
GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB55_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB55_3 +; GCN2-NEXT: s_branch .LBB55_4 +; GCN2-NEXT: .LBB55_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB55_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB55_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -3088,15 +8144,37 @@ define 
amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umin_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB55_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB55_3 +; GFX12-NEXT: s_branch .LBB55_4 +; GFX12-NEXT: .LBB55_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB55_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB55_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3110,43 
+8188,134 @@ entry: define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB56_4 +; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB56_2 +; GCN1-NEXT: .LBB56_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; 
GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB56_4 +; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB56_2 +; GCN2-NEXT: .LBB56_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: 
s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB56_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB56_4 +; GFX12-NEXT: .LBB56_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB56_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB56_2 +; GFX12-NEXT: .LBB56_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_or_b32_e32 v0, s2, v0 +; 
GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3157,39 +8326,102 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB57_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB57_3 +; GCN1-NEXT: s_branch .LBB57_4 +; GCN1-NEXT: .LBB57_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB57_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: 
v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB57_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB57_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB57_3 +; GCN2-NEXT: s_branch .LBB57_4 +; GCN2-NEXT: 
.LBB57_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB57_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB57_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3198,12 +8430,34 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB57_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB57_3 +; GFX12-NEXT: s_branch .LBB57_4 +; GFX12-NEXT: .LBB57_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB57_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3217,40 +8471,107 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB58_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; 
GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB58_4 +; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB58_2 +; GCN1-NEXT: .LBB58_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; 
GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB58_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB58_4 +; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB58_2 +; GCN2-NEXT: .LBB58_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_addr64_offset: @@ -3260,13 +8581,37 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; 
GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB58_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB58_4 +; GFX12-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB58_2 +; GFX12-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3278,43 +8623,106 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB59_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB59_3 +; GCN1-NEXT: s_branch .LBB59_4 +; GCN1-NEXT: .LBB59_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB59_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB59_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; 
GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB59_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB59_3 +; GCN2-NEXT: s_branch .LBB59_4 +; GCN2-NEXT: .LBB59_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB59_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: 
buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB59_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3323,13 +8731,35 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB59_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB59_3 +; GFX12-NEXT: s_branch .LBB59_4 +; GFX12-NEXT: .LBB59_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB59_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB59_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3344,39 +8774,128 @@ entry: define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_or_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB60_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB60_4 +; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB60_2 +; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 
s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB60_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB60_4 +; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB60_2 +; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN2-NEXT: 
s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB60_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB60_4 +; GFX12-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB60_2 +; GFX12-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst @@ -3386,16 +8905,48 @@ 
entry: define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB61_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB61_3 +; GCN1-NEXT: s_branch .LBB61_4 +; GCN1-NEXT: .LBB61_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB61_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB61_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 
v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3403,16 +8954,47 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_or_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB61_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB61_3 +; GCN2-NEXT: s_branch .LBB61_4 +; GCN2-NEXT: .LBB61_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB61_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB61_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; 
GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3423,12 +9005,32 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB61_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB61_3 +; GFX12-NEXT: s_branch .LBB61_4 +; GFX12-NEXT: .LBB61_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB61_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB61_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3441,36 +9043,103 @@ entry: define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; 
GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB62_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB62_4 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB62_2 +; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: 
atomic_or_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB62_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB62_4 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB62_2 +; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: 
v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_addr64: @@ -3478,15 +9147,38 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB62_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB62_4 +; GFX12-NEXT: .LBB62_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB62_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB62_2 +; GFX12-NEXT: .LBB62_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX12-NEXT: 
scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3497,54 +9189,138 @@ entry: define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_or_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB63_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB63_3 +; GCN1-NEXT: s_branch .LBB63_4 +; GCN1-NEXT: .LBB63_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB63_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: 
s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB63_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] 
glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB63_3 +; GCN2-NEXT: s_branch .LBB63_4 +; GCN2-NEXT: .LBB63_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB63_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_or_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB63_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB63_3 +; GFX12-NEXT: s_branch .LBB63_4 +; GFX12-NEXT: .LBB63_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB63_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB63_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3558,43 +9334,123 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], 
-1 +; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB64_4 +; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB64_2 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: 
s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB64_4 +; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB64_2 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB64_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB64_4 +; GFX12-NEXT: .LBB64_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB64_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] 
offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB64_2 +; GFX12-NEXT: .LBB64_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3605,43 +9461,123 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB65_4 +; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; 
GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB65_2 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB65_4 +; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global +; 
GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB65_2 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB65_4 +; GFX12-NEXT: .LBB65_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB65_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB65_2 +; GFX12-NEXT: .LBB65_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -3652,43 +9588,123 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB66_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB66_4 +; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB66_2 +; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], 
s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB66_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB66_4 +; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB66_2 +; GCN2-NEXT: 
.LBB66_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB66_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB66_4 +; GFX12-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB66_2 +; GFX12-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr %out, i32 4 @@ -3699,39 +9715,100 @@ entry: define amdgpu_kernel void 
@atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB67_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB67_3 +; GCN1-NEXT: s_branch .LBB67_4 +; GCN1-NEXT: .LBB67_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; 
GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB67_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB67_3 +; GCN2-NEXT: s_branch .LBB67_4 +; GCN2-NEXT: .LBB67_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; 
GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3740,13 +9817,34 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB67_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB67_3 +; GFX12-NEXT: s_branch .LBB67_4 +; GFX12-NEXT: .LBB67_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB67_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3759,40 +9857,99 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB68_4 +; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; 
GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB68_2 +; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB68_4 +; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN2-NEXT: 
s_endpgm +; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB68_2 +; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: @@ -3802,13 +9959,34 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB68_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB68_4 +; GFX12-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX12-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB68_2 +; GFX12-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3820,43 +9998,104 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB69_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB69_3 +; GCN1-NEXT: s_branch .LBB69_4 +; GCN1-NEXT: .LBB69_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB69_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s9 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB69_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; 
GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB69_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB69_3 +; GCN2-NEXT: s_branch .LBB69_4 +; GCN2-NEXT: .LBB69_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB69_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3865,14 +10104,35 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB69_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB69_3 +; GFX12-NEXT: s_branch .LBB69_4 +; GFX12-NEXT: .LBB69_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB69_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB69_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3886,39 +10146,117 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; 
GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB70_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB70_4 +; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB70_2 +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB70_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB70_4 +; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB70_2 +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB70_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB70_4 +; GFX12-NEXT: .LBB70_2: ; %atomicrmw.phi +; 
GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB70_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB70_2 +; GFX12-NEXT: .LBB70_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst @@ -3928,35 +10266,96 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB71_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB71_3 +; GCN1-NEXT: s_branch .LBB71_4 +; GCN1-NEXT: .LBB71_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GCN1-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB71_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB71_3 +; GCN2-NEXT: s_branch .LBB71_4 +; GCN2-NEXT: .LBB71_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB71_3: ; 
%atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3965,13 +10364,32 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB71_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB71_3 +; GFX12-NEXT: s_branch .LBB71_4 +; GFX12-NEXT: .LBB71_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB71_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 
v[0:1], off, s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB71_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3983,36 +10401,95 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB72_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB72_4 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB72_2 +; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN1-NEXT: 
v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB72_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB72_4 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB72_2 +; 
GCN2-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_addr64: @@ -4020,15 +10497,35 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB72_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB72_4 +; GFX12-NEXT: .LBB72_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB72_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB72_2 +; GFX12-NEXT: .LBB72_4: ; %atomicrmw.private +; 
GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4039,55 +10536,136 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB73_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB73_3 +; GCN1-NEXT: s_branch .LBB73_4 +; GCN1-NEXT: .LBB73_2: +; 
GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB73_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s9 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB73_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 
+; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB73_3 +; GCN2-NEXT: s_branch .LBB73_4 +; GCN2-NEXT: .LBB73_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB73_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz 
.LBB73_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB73_3 +; GFX12-NEXT: s_branch .LBB73_4 +; GFX12-NEXT: .LBB73_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB73_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB73_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4100,43 +10678,134 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, 
s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB74_4 +; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB74_2 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 
0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB74_4 +; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB74_2 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: 
s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB74_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB74_4 +; GFX12-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB74_2 +; GFX12-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4147,39 +10816,102 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, 
s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB75_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB75_3 +; GCN1-NEXT: s_branch .LBB75_4 +; GCN1-NEXT: .LBB75_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB75_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB75_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB75_3 +; GCN2-NEXT: s_branch .LBB75_4 +; GCN2-NEXT: .LBB75_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB75_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -4188,12 +10920,34 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr 
%out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB75_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB75_3 +; GFX12-NEXT: s_branch .LBB75_4 +; GFX12-NEXT: .LBB75_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB75_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB75_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4207,40 +10961,107 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 
s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB76_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB76_4 +; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB76_2 +; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; 
GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB76_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB76_4 +; GCN2-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB76_2 +; GCN2-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; 
GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_addr64_offset: @@ -4250,13 +11071,37 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB76_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB76_4 +; GFX12-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB76_2 +; GFX12-NEXT: .LBB76_4: ; 
%atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4268,43 +11113,106 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB77_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB77_3 +; GCN1-NEXT: s_branch .LBB77_4 +; GCN1-NEXT: .LBB77_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB77_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB77_3 +; GCN2-NEXT: s_branch .LBB77_4 +; GCN2-NEXT: .LBB77_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -4313,13 +11221,35 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: 
s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB77_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB77_3 +; GFX12-NEXT: s_branch .LBB77_4 +; GFX12-NEXT: .LBB77_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB77_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB77_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4334,39 +11264,128 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB78_4 +; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB78_2 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, 
SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB78_4 +; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB78_2 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; 
GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB78_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB78_4 +; GFX12-NEXT: .LBB78_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB78_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB78_2 +; GFX12-NEXT: .LBB78_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst @@ -4376,16 +11395,48 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB79_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: 
v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB79_3 +; GCN1-NEXT: s_branch .LBB79_4 +; GCN1-NEXT: .LBB79_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB79_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4393,16 +11444,47 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB79_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 
+; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB79_3 +; GCN2-NEXT: s_branch .LBB79_4 +; GCN2-NEXT: .LBB79_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB79_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4413,12 +11495,32 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB79_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB79_3 +; GFX12-NEXT: s_branch .LBB79_4 +; GFX12-NEXT: .LBB79_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB79_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB79_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4431,36 +11533,103 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB80_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB80_4 +; 
GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB80_2 +; GCN1-NEXT: .LBB80_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB80_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB80_4 +; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB80_2 +; GCN2-NEXT: .LBB80_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_addr64: @@ -4468,15 +11637,38 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB80_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB80_4 +; GFX12-NEXT: .LBB80_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB80_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB80_2 +; GFX12-NEXT: .LBB80_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4487,54 +11679,138 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; 
GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB81_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB81_3 +; GCN1-NEXT: s_branch .LBB81_4 +; GCN1-NEXT: .LBB81_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, 
SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB81_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB81_3 +; GCN2-NEXT: s_branch .LBB81_4 +; GCN2-NEXT: .LBB81_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, 
s[88:91], 0 offen +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB81_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB81_3 +; GFX12-NEXT: s_branch .LBB81_4 +; GFX12-NEXT: .LBB81_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB81_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB81_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 
v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5908,43 +13184,143 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB107_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB107_4 +; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB107_2 +; GCN1-NEXT: .LBB107_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, 
s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB107_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB107_4 +; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 
v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB107_2 +; GCN2-NEXT: .LBB107_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB107_4 +; GFX12-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB107_2 +; GFX12-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -5955,39 +13331,108 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB108_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 
v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB108_3 +; GCN1-NEXT: s_branch .LBB108_4 +; GCN1-NEXT: .LBB108_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB108_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB108_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; 
GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB108_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB108_3 +; GCN2-NEXT: s_branch .LBB108_4 +; GCN2-NEXT: .LBB108_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB108_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB108_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -5996,12 +13441,37 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 
s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB108_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB108_3 +; GFX12-NEXT: s_branch .LBB108_4 +; GFX12-NEXT: .LBB108_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6015,40 +13485,113 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; 
GCN1-LABEL: atomic_inc_i64_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB109_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB109_4 +; GCN1-NEXT: .LBB109_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB109_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB109_2 +; GCN1-NEXT: .LBB109_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword 
v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB109_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB109_4 +; GCN2-NEXT: .LBB109_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB109_2 +; GCN2-NEXT: .LBB109_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_incr64_offset: @@ -6058,13 +13601,40 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB109_4 +; GFX12-NEXT: .LBB109_2: ; 
%atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB109_2 +; GFX12-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6076,43 +13646,112 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: 
s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB110_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB110_3 +; GCN1-NEXT: s_branch .LBB110_4 +; GCN1-NEXT: .LBB110_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB110_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB110_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB110_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB110_3 +; GCN2-NEXT: s_branch .LBB110_4 +; GCN2-NEXT: .LBB110_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB110_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt 
vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB110_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6121,13 +13760,38 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB110_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB110_3 +; GFX12-NEXT: s_branch .LBB110_4 +; GFX12-NEXT: .LBB110_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB110_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB110_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6142,39 +13806,137 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB111_4 +; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], 
v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB111_2 +; GCN1-NEXT: .LBB111_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB111_4 +; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; 
GCN2-NEXT: .LBB111_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB111_2 +; GCN2-NEXT: .LBB111_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB111_4 +; GFX12-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB111_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB111_2 +; GFX12-NEXT: .LBB111_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -6184,16 +13946,51 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB112_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB112_3 +; GCN1-NEXT: s_branch .LBB112_4 +; GCN1-NEXT: .LBB112_2: +; 
GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB112_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB112_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6201,16 +13998,50 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB112_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: 
v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB112_3 +; GCN2-NEXT: s_branch .LBB112_4 +; GCN2-NEXT: .LBB112_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB112_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB112_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6221,12 +14052,35 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB112_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 
v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB112_3 +; GFX12-NEXT: s_branch .LBB112_4 +; GFX12-NEXT: .LBB112_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6239,36 +14093,109 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: 
s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB113_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB113_4 +; GCN1-NEXT: .LBB113_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB113_2 +; GCN1-NEXT: .LBB113_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: 
s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB113_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB113_4 +; GCN2-NEXT: .LBB113_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB113_2 +; GCN2-NEXT: .LBB113_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_incr64: @@ -6276,15 +14203,41 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, 
i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB113_4 +; GFX12-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB113_2 +; GFX12-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = 
getelementptr i64, ptr %out, i64 %index @@ -6295,54 +14248,147 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB114_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB114_3 +; GCN1-NEXT: s_branch .LBB114_4 +; GCN1-NEXT: .LBB114_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB114_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; 
GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB114_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB114_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB114_3 +; GCN2-NEXT: s_branch .LBB114_4 +; GCN2-NEXT: .LBB114_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB114_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB114_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: 
flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB114_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB114_3 +; GFX12-NEXT: s_branch .LBB114_4 +; GFX12-NEXT: .LBB114_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6356,43 +14402,154 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB115_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB115_4 +; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB115_2 +; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: 
buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB115_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB115_4 +; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB115_2 +; GCN2-NEXT: .LBB115_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: 
v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB115_4 +; GFX12-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB115_2 +; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -6403,39 +14560,116 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB116_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB116_3 +; GCN1-NEXT: s_branch .LBB116_4 +; GCN1-NEXT: .LBB116_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB116_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: 
v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB116_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB116_3 +; GCN2-NEXT: s_branch .LBB116_4 +; GCN2-NEXT: .LBB116_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB116_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 
; GCN2-NEXT: s_endpgm ; @@ -6443,13 +14677,43 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB116_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB116_3 +; GFX12-NEXT: s_branch .LBB116_4 +; GFX12-NEXT: .LBB116_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6463,40 +14727,119 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB117_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB117_4 +; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: 
flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB117_2 +; GCN1-NEXT: .LBB117_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; 
GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB117_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB117_4 +; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB117_2 +; GCN2-NEXT: .LBB117_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64_offset: @@ -6506,13 +14849,45 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB117_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB117_4 +; GFX12-NEXT: .LBB117_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB117_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB117_2 +; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: 
s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6524,43 +14899,120 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB118_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB118_3 +; GCN1-NEXT: s_branch .LBB118_4 +; GCN1-NEXT: .LBB118_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB118_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: 
v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB118_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB118_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB118_3 +; GCN2-NEXT: s_branch .LBB118_4 +; GCN2-NEXT: .LBB118_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB118_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB118_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6569,13 +15021,43 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 
s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB118_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB118_3 +; GFX12-NEXT: s_branch .LBB118_4 +; GFX12-NEXT: .LBB118_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: .LBB118_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6590,39 +15072,148 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB119_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB119_4 +; GCN1-NEXT: .LBB119_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB119_2 +; GCN1-NEXT: .LBB119_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 
v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB119_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB119_4 +; GCN2-NEXT: .LBB119_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB119_2 +; GCN2-NEXT: .LBB119_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB119_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB119_4 +; GFX12-NEXT: .LBB119_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB119_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB119_2 +; GFX12-NEXT: .LBB119_4: ; 
%atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -6632,16 +15223,55 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB120_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz 
.LBB120_3 +; GCN1-NEXT: s_branch .LBB120_4 +; GCN1-NEXT: .LBB120_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB120_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6649,16 +15279,54 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] 
+; GCN2-NEXT: s_cbranch_vccz .LBB120_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB120_3 +; GCN2-NEXT: s_branch .LBB120_4 +; GCN2-NEXT: .LBB120_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB120_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6668,13 +15336,42 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 
+; GFX12-NEXT: s_cmp_eq_u32 s5, s1 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccz .LBB120_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB120_3 +; GFX12-NEXT: s_branch .LBB120_4 +; GFX12-NEXT: .LBB120_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB120_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s4, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6687,36 +15384,115 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; 
GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB121_4 +; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB121_2 +; GCN1-NEXT: .LBB121_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; 
GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB121_4 +; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB121_2 +; GCN2-NEXT: .LBB121_4: ; %atomicrmw.private +; GCN2-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64: @@ -6724,15 +15500,46 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB121_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB121_4 +; GFX12-NEXT: .LBB121_2: ; %atomicrmw.phi +; 
GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB121_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB121_2 +; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6743,54 +15550,160 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 
s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB122_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB122_3 +; GCN1-NEXT: s_branch .LBB122_4 +; GCN1-NEXT: .LBB122_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB122_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB122_4: ; 
%atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB122_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB122_3 +; GCN2-NEXT: s_branch .LBB122_4 +; GCN2-NEXT: .LBB122_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB122_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; 
GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB122_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB122_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB122_3 +; GFX12-NEXT: s_branch .LBB122_4 +; GFX12-NEXT: .LBB122_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: .LBB122_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 64bd4804ccd51..d9c6e4ad5006a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -5005,7 +5005,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5061,7 +5061,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 9000 
- %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5121,7 +5121,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5184,7 +5184,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5257,7 +5257,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5310,7 +5310,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5365,7 +5365,7 @@ define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5423,7 +5423,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5491,7 +5491,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 ret void @@ -5543,7 +5543,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %in, i64 4 - %val = load atomic double, ptr %gep seq_cst, align 8 + %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void } @@ -5589,7 +5589,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 + %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void } @@ -5654,7 +5654,7 
@@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 entry: %ptr = getelementptr double, ptr %in, i64 %index %gep = getelementptr double, ptr %ptr, i64 4 - %val = load atomic double, ptr %gep seq_cst, align 8 + %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void } @@ -5714,7 +5714,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %in, i64 %index - %val = load atomic double, ptr %ptr seq_cst, align 8 + %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void } @@ -5757,7 +5757,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 - store atomic double %in, ptr %gep seq_cst, align 8 + store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5794,7 +5794,7 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - store atomic double %in, ptr %out seq_cst, align 8 + store atomic double %in, ptr %out seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5850,7 +5850,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, entry: %ptr = getelementptr double, ptr %out, i64 %index %gep = getelementptr double, ptr %ptr, i64 4 - store atomic double %in, ptr %gep seq_cst, align 8 + store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5901,7 +5901,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr double, ptr %out, i64 %index - store atomic double %in, ptr %ptr seq_cst, align 8 + store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 
ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 3fd624b592cd4..d7bd4b1e4918e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -11,25 +11,100 @@ define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_4 +; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB0_2 +; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: 
v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_4 +; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB0_2 +; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_4 +; GCN3-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB0_2 +; GCN3-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -39,29 +114,106 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_4 +; GCN1-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB1_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB1_2 +; GCN1-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt 
vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_4 +; GCN2-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB1_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB1_2 +; GCN2-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: 
s_cbranch_execnz .LBB1_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB1_4 +; GCN3-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB1_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB1_2 +; GCN3-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -72,25 +224,116 @@ define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_4 +; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global +; 
GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB2_2 +; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_4 +; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB2_2 +; GCN2-NEXT: .LBB2_4: ; 
%atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_4 +; GCN3-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB2_2 +; GCN3-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -100,29 +343,116 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_4 +; GCN1-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB3_2 +; GCN1-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: 
s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_4 +; GCN2-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB3_2 +; GCN2-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: 
flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_4 +; GCN3-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB3_2 +; GCN3-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -133,37 +463,112 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB4_4 +; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB4_2 +; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; 
GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB4_4 +; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB4_2 +; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB4_4 +; GCN3-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: 
v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB4_2 +; GCN3-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -173,41 +578,118 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB5_4 +; GCN1-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz 
.LBB5_2 +; GCN1-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB5_4 +; GCN2-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB5_2 +; GCN2-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; 
GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB5_4 +; GCN3-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB5_2 +; GCN3-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -218,37 
+700,112 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB6_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB6_3 +; GCN1-NEXT: s_branch .LBB6_4 +; GCN1-NEXT: .LBB6_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; 
GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB6_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB6_3 +; GCN2-NEXT: s_branch .LBB6_4 +; GCN2-NEXT: .LBB6_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: 
s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB6_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB6_3 +; GCN3-NEXT: s_branch .LBB6_4 +; GCN3-NEXT: .LBB6_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -258,41 +815,118 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB7_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; 
GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB7_3 +; GCN1-NEXT: s_branch .LBB7_4 +; GCN1-NEXT: .LBB7_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB7_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: 
flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB7_3 +; GCN2-NEXT: s_branch .LBB7_4 +; GCN2-NEXT: .LBB7_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB7_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB7_3 +; GCN3-NEXT: s_branch .LBB7_4 +; GCN3-NEXT: .LBB7_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB7_3: ; 
%atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -303,29 +937,106 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_4 +; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB8_2 +; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: 
buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_4 +; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB8_2 +; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, 
v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_4 +; GCN3-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB8_2 +; GCN3-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -336,29 +1047,116 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; 
GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_4 +; GCN1-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB9_2 +; GCN1-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_3 +; 
GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_4 +; GCN2-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB9_2 +; GCN2-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB9_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB9_4 +; GCN3-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], 
v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB9_2 +; GCN3-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -373,25 +1171,100 @@ define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_4 +; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB10_2 +; GCN1-NEXT: .LBB10_4: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_4 +; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB10_2 +; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_4 +; GCN3-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB10_2 +; GCN3-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -401,29 +1274,106 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_4 +; GCN1-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB11_3: ; %atomicrmw.global ; GCN1-NEXT: 
flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB11_2 +; GCN1-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_4 +; GCN2-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB11_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB11_2 +; GCN2-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 
+; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_4 +; GCN3-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB11_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB11_2 +; GCN3-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -434,25 +1384,116 @@ define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 
s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB12_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB12_4 +; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB12_2 +; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB12_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB12_4 +; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB12_2 +; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB12_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB12_4 +; GCN3-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], 
v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB12_2 +; GCN3-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret double %result @@ -462,29 +1503,116 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB13_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB13_4 +; GCN1-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB13_2 +; GCN1-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB13_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB13_4 +; GCN2-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN2-NEXT: s_cbranch_execz .LBB13_2 +; GCN2-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB13_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB13_4 +; GCN3-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB13_2 +; GCN3-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: 
buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -495,37 +1623,112 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB14_4 +; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB14_2 +; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, 
v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB14_4 +; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB14_2 +; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB14_4 +; GCN3-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB14_2 +; GCN3-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -535,41 +1738,118 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB15_4 +; GCN1-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB15_2 +; GCN1-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; 
GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB15_4 +; GCN2-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB15_2 +; GCN2-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB15_4 +; GCN3-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN3-NEXT: 
v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB15_2 +; GCN3-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -580,37 +1860,112 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB16_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB16_3 +; GCN1-NEXT: s_branch .LBB16_4 +; GCN1-NEXT: .LBB16_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB16_3: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB16_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB16_3 +; GCN2-NEXT: s_branch .LBB16_4 +; GCN2-NEXT: .LBB16_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB16_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, 
v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB16_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB16_3 +; GCN3-NEXT: s_branch .LBB16_4 +; GCN3-NEXT: .LBB16_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB16_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret 
double %result @@ -620,41 +1975,118 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB17_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB17_3 +; GCN1-NEXT: s_branch .LBB17_4 +; GCN1-NEXT: .LBB17_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_xchg_f64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB17_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB17_3 +; GCN2-NEXT: s_branch .LBB17_4 +; GCN2-NEXT: .LBB17_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; 
GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB17_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB17_3 +; GCN3-NEXT: s_branch .LBB17_4 +; GCN3-NEXT: .LBB17_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -665,29 +2097,106 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: 
v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_4 +; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB18_2 +; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_4 +; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB18_2 +; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_4 +; GCN3-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB18_2 +; GCN3-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, 
s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 @@ -698,29 +2207,116 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_4 +; GCN1-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB19_2 +; GCN1-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, 
s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_4 +; GCN2-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB19_2 +; GCN2-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 
s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_4 +; GCN3-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB19_2 +; GCN3-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 @@ -735,25 +2331,118 @@ define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB20_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB20_4 +; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB20_2 +; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB20_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; 
GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB20_4 +; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB20_2 +; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB20_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB20_4 +; GCN3-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 
+; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB20_2 +; GCN3-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -763,29 +2452,124 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB21_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB21_4 +; GCN1-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB21_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz 
.LBB21_2 +; GCN1-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB21_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB21_4 +; GCN2-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB21_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB21_2 +; GCN2-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; 
GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB21_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB21_4 +; GCN3-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB21_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB21_2 +; GCN3-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; 
GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -796,25 +2580,127 @@ define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB22_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB22_4 +; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB22_2 +; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, 
s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB22_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB22_4 +; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB22_2 +; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: 
s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB22_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB22_4 +; GCN3-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB22_2 +; GCN3-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -824,29 +2710,127 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; 
GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB23_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB23_4 +; GCN1-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB23_2 +; GCN1-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB23_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB23_4 +; GCN2-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB23_2 +; GCN2-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: 
flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB23_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB23_4 +; GCN3-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB23_2 +; GCN3-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -857,37 +2841,127 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: 
v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB24_4 +; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB24_2 +; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 
v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB24_4 +; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB24_2 +; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 
v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB24_4 +; GCN3-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB24_2 +; GCN3-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -897,41 +2971,133 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; 
GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB25_4 +; GCN1-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB25_2 +; GCN1-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; 
GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB25_4 +; GCN2-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB25_2 +; GCN2-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 
s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB25_4 +; GCN3-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB25_2 +; GCN3-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -942,37 +3108,121 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_add_x2 
v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB26_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_setpc_b64 s[30:31] -; +; GCN1-NEXT: s_cbranch_execz .LBB26_3 +; GCN1-NEXT: s_branch .LBB26_4 +; GCN1-NEXT: .LBB26_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: 
s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB26_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB26_3 +; GCN2-NEXT: s_branch .LBB26_4 +; GCN2-NEXT: .LBB26_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB26_2 +; GCN3-NEXT: ; %bb.1: 
; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB26_3 +; GCN3-NEXT: s_branch .LBB26_4 +; GCN3-NEXT: .LBB26_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -982,41 +3232,127 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB27_2 +; 
GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB27_3 +; GCN1-NEXT: s_branch .LBB27_4 +; GCN1-NEXT: .LBB27_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB27_2 +; GCN2-NEXT: ; %bb.1: ; 
%atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB27_3 +; GCN2-NEXT: s_branch .LBB27_4 +; GCN2-NEXT: .LBB27_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB27_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, 
s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB27_3 +; GCN3-NEXT: s_branch .LBB27_4 +; GCN3-NEXT: .LBB27_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -1027,29 +3363,124 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB28_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB28_4 +; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global ; 
GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB28_2 +; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB28_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB28_4 +; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB28_2 +; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB28_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB28_4 +; GCN3-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB28_2 +; GCN3-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; 
GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1060,29 +3491,127 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB29_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB29_4 +; GCN1-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB29_2 +; GCN1-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB29_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB29_4 +; GCN2-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN2-NEXT: s_cbranch_execz .LBB29_2 +; GCN2-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB29_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB29_4 +; GCN3-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB29_2 +; GCN3-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 
v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1097,25 +3626,118 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB30_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB30_4 +; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB30_2 +; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 
0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB30_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB30_4 +; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB30_2 +; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt 
vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB30_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB30_4 +; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB30_2 +; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -1125,29 +3747,124 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB31_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB31_2 +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz 
.LBB31_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB31_2 +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB31_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] 
+; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB31_2 +; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1158,25 +3875,127 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_4 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB32_3: 
; %atomicrmw.global +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB32_2 +; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_4 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; 
GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB32_2 +; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB32_2 +; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN3-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1186,29 +4005,127 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN1-NEXT: s_cbranch_execz .LBB33_2 +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB33_2 +; 
GCN2-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB33_2 +; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 
0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1219,37 +4136,127 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB34_2 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], 
s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB34_2 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: 
buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB34_4 +; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB34_2 +; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_subrev_co_u32_e32 v1, 
vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -1259,41 +4266,133 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB35_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB35_4 +; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB35_2 +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; 
GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB35_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB35_4 +; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB35_2 +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; 
GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB35_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB35_4 +; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB35_2 +; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt 
vmcnt(1) +; GCN3-NEXT: v_subrev_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1304,37 +4403,121 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB36_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB36_3 +; GCN1-NEXT: s_branch .LBB36_4 +; GCN1-NEXT: .LBB36_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: 
buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB36_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB36_3 +; GCN2-NEXT: s_branch .LBB36_4 +; GCN2-NEXT: .LBB36_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: 
v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB36_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB36_3 +; GCN3-NEXT: s_branch .LBB36_4 +; GCN3-NEXT: .LBB36_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 
%in seq_cst ret i64 %result @@ -1344,41 +4527,127 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB37_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB37_3 +; GCN1-NEXT: s_branch .LBB37_4 +; GCN1-NEXT: .LBB37_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; 
GCN1-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB37_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB37_3 +; GCN2-NEXT: s_branch .LBB37_4 +; GCN2-NEXT: .LBB37_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 
s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB37_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB37_3 +; GCN3-NEXT: s_branch .LBB37_4 +; GCN3-NEXT: .LBB37_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1389,29 +4658,124 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr 
%out, ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB38_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB38_4 +; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB38_2 +; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 
0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB38_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB38_4 +; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB38_2 +; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base 
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB38_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB38_4 +; GCN3-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB38_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB38_2 +; GCN3-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1422,29 +4786,127 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, 
s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB39_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB39_4 +; GCN1-NEXT: .LBB39_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB39_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB39_2 +; GCN1-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; 
GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB39_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB39_4 +; GCN2-NEXT: .LBB39_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB39_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB39_2 +; GCN2-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 
vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB39_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB39_4 +; GCN3-NEXT: .LBB39_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB39_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB39_2 +; GCN3-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1459,25 +4921,118 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_and_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
GCN1-NEXT: s_cbranch_execnz .LBB40_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB40_4 +; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB40_2 +; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB40_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB40_4 +; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], 
v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB40_2 +; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB40_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB40_4 +; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB40_2 +; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, 
s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1487,29 +5042,124 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_and_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB41_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB41_4 +; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB41_2 +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt 
vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB41_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB41_4 +; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB41_2 +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB41_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB41_4 +; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB41_2 +; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1520,25 +5170,127 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: 
flat_atomic_and_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB42_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB42_4 +; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB42_2 +; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 
s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB42_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB42_4 +; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB42_2 +; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB42_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB42_4 +; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB42_2 +; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1548,29 +5300,127 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB43_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB43_4 +; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB43_2 +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: 
; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB43_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB43_4 +; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB43_2 +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB43_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; 
GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB43_4 +; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB43_2 +; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1581,37 +5431,124 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: 
s_cbranch_vccnz .LBB44_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB44_4 +; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB44_2 +; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] 
+; GCN2-NEXT: s_cbranch_vccz .LBB44_4 +; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB44_2 +; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB44_4 +; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; 
GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB44_2 +; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1621,41 +5558,130 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB45_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB45_4 +; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: 
v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB45_2 +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB45_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB45_4 +; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, 
s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB45_2 +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB45_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB45_4 +; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB45_2 +; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1666,37 +5692,118 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB46_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB46_3 +; GCN1-NEXT: s_branch .LBB46_4 +; GCN1-NEXT: .LBB46_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB46_3: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB46_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB46_3 +; GCN2-NEXT: s_branch .LBB46_4 +; GCN2-NEXT: .LBB46_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; 
GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB46_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB46_3 +; GCN3-NEXT: s_branch .LBB46_4 +; GCN3-NEXT: .LBB46_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: 
buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1706,41 +5813,124 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB47_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB47_3 +; GCN1-NEXT: s_branch .LBB47_4 +; GCN1-NEXT: .LBB47_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: 
v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB47_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB47_3 +; GCN2-NEXT: s_branch .LBB47_4 +; GCN2-NEXT: .LBB47_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; 
GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB47_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB47_3 +; GCN3-NEXT: s_branch .LBB47_4 +; GCN3-NEXT: .LBB47_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1751,29 +5941,124 @@ define void 
@flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_4 +; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB48_2 +; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 
s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_4 +; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB48_2 +; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_4 +; GCN3-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB48_2 +; GCN3-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1784,29 +6069,127 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 
v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_4 +; GCN1-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB49_2 +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_4 +; GCN2-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB49_2 +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_4 +; GCN3-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB49_2 +; GCN3-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1821,12 +6204,26 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN1-NEXT: s_cbranch_execnz .LBB50_6 +; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1838,23 +6235,58 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB50_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB50_2 +; GCN1-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: 
v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_6 +; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1866,20 +6298,53 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB50_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; 
GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB50_2 +; GCN2-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_6 +; GCN3-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1891,12 +6356,32 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, 
i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB50_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB50_2 +; GCN3-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -1906,17 +6391,211 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 
v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_6 +; GCN1-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB51_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB51_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v4 +; GCN1-NEXT: v_not_b32_e32 v4, v8 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB51_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB51_2 +; GCN1-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, 
-1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_6 +; GCN2-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB51_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB51_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN2-NEXT: v_not_b32_e32 v5, v4 +; GCN2-NEXT: v_not_b32_e32 v4, v8 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB51_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB51_2 +; GCN2-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_6 +; GCN3-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB51_3: 
; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB51_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN3-NEXT: v_not_b32_e32 v5, v4 +; GCN3-NEXT: v_not_b32_e32 v4, v8 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB51_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB51_2 +; GCN3-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + ret void +} + +define void @flat_atomic_nand_i64_noret_offset__noalias_private(ptr %out, i64 %in) { +; GCN1-LABEL: 
flat_atomic_nand_i64_noret_offset__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 ; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 ; GCN1-NEXT: v_not_b32_e32 v5, v0 ; GCN1-NEXT: v_not_b32_e32 v4, v1 @@ -1928,12 +6607,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 @@ -1943,7 +6622,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 @@ -1958,17 +6637,17 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: 
s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1983,12 +6662,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1996,12 +6675,21 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB53_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, 
v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB53_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -2014,24 +6702,56 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB53_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB53_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB53_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB53_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; 
GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB53_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB53_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -2044,40 +6764,96 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB53_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB53_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB53_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; 
GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB53_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB53_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB53_6 +; GCN3-NEXT: .LBB53_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 
v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB53_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB53_2 +; GCN3-NEXT: .LBB53_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in 
seq_cst ret i64 %result @@ -2087,6 +6863,203 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB54_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB54_6 +; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN1-NEXT: v_not_b32_e32 v7, v0 +; GCN1-NEXT: v_not_b32_e32 v6, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB54_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; 
implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB54_2 +; GCN1-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB54_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB54_6 +; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN2-NEXT: v_not_b32_e32 v7, v0 +; GCN2-NEXT: v_not_b32_e32 v6, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB54_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB54_2 +; GCN2-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB54_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB54_6 +; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB54_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB54_2 +; GCN3-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 
exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst + ret i64 %result +} + +define i64 @flat_atomic_nand_i64_ret_offset__noalias_private(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 @@ -2094,7 +7067,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -2109,12 +7082,12 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 @@ -2124,7 +7097,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -2139,17 +7112,17 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -2164,14 +7137,14 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2179,18 +7152,30 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2205,26 +7190,57 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB56_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB56_2 +; GCN1-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 
0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2239,21 +7255,49 @@ define amdgpu_gfx void 
@flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB56_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB56_2 +; GCN2-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN3-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB54_1: 
; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2268,9 +7312,25 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB56_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB56_2 +; GCN3-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -2280,6 +7340,201 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: 
s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN1-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB57_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB57_2 +; GCN1-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: 
v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN2-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, 
exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB57_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB57_2 +; GCN2-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN3-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v0 
+; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB57_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB57_2 +; GCN3-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + ret void +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace(ptr inreg %out, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: s_add_u32 s36, s4, 36 @@ -2291,7 +7546,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB58_1: ; 
%atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2306,12 +7561,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 @@ -2325,7 +7580,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2340,12 +7595,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 @@ -2354,7 +7609,7 @@ define amdgpu_gfx void 
@flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2369,12 +7624,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2382,18 +7637,24 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB59_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 
-; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2408,26 +7669,55 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB59_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB59_6 +; GCN1-NEXT: .LBB59_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB59_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: v_not_b32_e32 v4, v4 +; GCN1-NEXT: v_not_b32_e32 v5, v5 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB59_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2442,21 +7732,47 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB59_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB59_6 +; GCN2-NEXT: .LBB59_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB59_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; 
GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: v_not_b32_e32 v4, v4 +; GCN2-NEXT: v_not_b32_e32 v5, v5 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB59_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2471,9 +7787,29 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB59_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB59_6 +; GCN3-NEXT: .LBB59_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB59_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; 
GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2483,6 +7819,195 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB60_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[2:3], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB60_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB60_6 +; GCN1-NEXT: .LBB60_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB60_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: v_not_b32_e32 v4, v4 +; GCN1-NEXT: v_not_b32_e32 v5, v5 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB60_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: 
v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB60_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB60_6 +; GCN2-NEXT: .LBB60_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB60_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: v_not_b32_e32 v4, v4 +; GCN2-NEXT: v_not_b32_e32 v5, v5 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB60_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB60_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB60_6 +; GCN3-NEXT: .LBB60_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB60_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, 
s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar__noalias_private(ptr inreg %out, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: s_add_u32 s36, s4, 36 @@ -2494,7 +8019,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2509,12 +8034,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 @@ -2528,7 +8053,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: 
.LBB61_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2543,12 +8068,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 @@ -2557,7 +8082,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2572,12 +8097,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2585,86 +8110,190 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; 
GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB62_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB62_6 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB62_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v4 +; GCN1-NEXT: v_not_b32_e32 v4, v8 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; 
GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB62_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB62_2 +; GCN1-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; 
GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB62_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB62_6 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB62_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN2-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN2-NEXT: v_not_b32_e32 v5, v4 +; GCN2-NEXT: v_not_b32_e32 v4, v8 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB62_4 +; 
GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB62_2 +; GCN2-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB62_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB62_6 +; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB62_4: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 ; GCN3-NEXT: v_not_b32_e32 v5, v4 ; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB62_2 +; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2675,14 
+8304,29 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB63_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB63_6 +; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -2695,24 +8339,60 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: 
s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB63_2 +; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB63_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB63_6 +; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN2-NEXT: 
v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -2725,38 +8405,92 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB63_2 +; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB63_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB63_6 +; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB59_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB63_2 +; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2771,25 +8505,118 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB64_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB64_4 +; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi +; 
GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB64_2 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB64_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB64_4 +; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB64_2 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB64_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB64_4 +; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB64_2 +; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: 
v_or_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2799,29 +8626,124 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB65_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB65_4 +; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB65_2 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword 
v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB65_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB65_4 +; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB65_2 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: 
flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB65_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB65_4 +; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB65_2 +; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2832,25 +8754,127 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; 
GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB66_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB66_4 +; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB66_2 +; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, 
v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB66_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB66_4 +; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB66_2 +; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB66_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz 
.LBB66_4 +; GCN3-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB66_2 +; GCN3-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2860,29 +8884,127 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB67_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB67_4 +; GCN1-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB67_2 +; GCN1-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB67_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: 
s_cbranch_execnz .LBB67_4 +; GCN2-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB67_2 +; GCN2-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB67_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB67_4 +; GCN3-NEXT: .LBB67_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB67_3: ; %atomicrmw.global +; GCN3-NEXT: 
flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB67_2 +; GCN3-NEXT: .LBB67_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2893,37 +9015,124 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB68_4 +; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN1-NEXT: 
v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB68_2 +; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB68_4 +; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: 
v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB68_2 +; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB68_4 +; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB68_2 +; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private +; 
GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2933,41 +9142,130 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB69_4 +; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB69_2 +; GCN1-NEXT: .LBB69_4: ; 
%atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB69_4 +; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB69_2 +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private +; 
GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB69_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB69_4 +; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB69_2 +; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword 
v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2978,37 +9276,118 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB70_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB70_3 +; GCN1-NEXT: s_branch .LBB70_4 +; GCN1-NEXT: .LBB70_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, 
v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB70_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB70_3 +; GCN2-NEXT: s_branch .LBB70_4 +; GCN2-NEXT: .LBB70_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 +; GCN2-NEXT: 
buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB70_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB70_3 +; GCN3-NEXT: s_branch .LBB70_4 +; GCN3-NEXT: .LBB70_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB70_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB70_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -3018,41 +9397,124 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr 
inreg %out, i64 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB71_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB71_3 +; GCN1-NEXT: s_branch .LBB71_4 +; GCN1-NEXT: .LBB71_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; 
GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB71_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB71_3 +; GCN2-NEXT: s_branch .LBB71_4 +; GCN2-NEXT: .LBB71_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, 
s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB71_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB71_3 +; GCN3-NEXT: s_branch .LBB71_4 +; GCN3-NEXT: .LBB71_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB71_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -3063,29 +9525,124 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB72_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB72_4 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB72_2 +; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB72_3 +; GCN2-NEXT: ; 
%bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB72_4 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB72_2 +; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB72_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: 
.LBB72_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB72_2 +; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3095,30 +9652,128 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
GCN1-NEXT: s_cbranch_execnz .LBB73_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB73_2 +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB73_3 
+; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB73_2 +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB73_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi +; GCN3-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB73_2 +; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_or_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3133,25 +9788,118 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xor_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB74_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB74_4 +; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB74_2 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB74_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB74_4 +; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB74_2 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, 
v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB74_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB74_4 +; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB74_2 +; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -3161,29 +9909,124 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB75_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB75_4 +; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB75_2 +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_xor_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB75_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB75_4 +; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB75_2 +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB75_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB75_4 +; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB75_2 +; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3194,25 +10037,127 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB76_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB76_4 +; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB76_2 +; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB76_3 +; 
GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB76_4 +; GCN2-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB76_2 +; GCN2-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB76_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB76_4 +; GCN3-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; 
GCN3-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB76_2 +; GCN3-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -3222,29 +10167,127 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB77_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB77_4 +; GCN1-NEXT: .LBB77_2: ; %atomicrmw.phi +; 
GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB77_2 +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB77_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB77_4 +; GCN2-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: 
s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB77_2 +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB77_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB77_4 +; GCN3-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol 
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB77_2 +; GCN3-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3255,37 +10298,124 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB78_4 +; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 
v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB78_2 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB78_4 +; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB78_2 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB78_4 +; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB78_2 +; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; 
GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -3295,41 +10425,130 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB79_4 +; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB79_2 +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], 
s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB79_4 +; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB79_2 +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: 
v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB79_4 +; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB79_2 +; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 
offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3340,37 +10559,118 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB80_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB80_3 +; GCN1-NEXT: s_branch .LBB80_4 +; GCN1-NEXT: .LBB80_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; 
GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB80_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB80_3 +; GCN2-NEXT: s_branch .LBB80_4 +; GCN2-NEXT: .LBB80_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, 
v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB80_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB80_3 +; GCN3-NEXT: s_branch .LBB80_4 +; GCN3-NEXT: .LBB80_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -3380,41 +10680,124 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; 
GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB81_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB81_3 +; GCN1-NEXT: s_branch .LBB81_4 +; GCN1-NEXT: .LBB81_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: 
s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB81_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB81_3 +; GCN2-NEXT: s_branch .LBB81_4 +; GCN2-NEXT: .LBB81_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: 
flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB81_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB81_3 +; GCN3-NEXT: s_branch .LBB81_4 +; GCN3-NEXT: .LBB81_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3425,29 +10808,124 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB82_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB82_4 +; GCN1-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB82_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB82_2 +; GCN1-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB82_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB82_4 +; GCN2-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB82_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB82_2 +; GCN2-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB82_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB82_4 +; GCN3-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB82_3: ; %atomicrmw.global +; 
GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB82_2 +; GCN3-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3458,29 +10936,127 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_4 +; 
GCN1-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB83_2 +; GCN1-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_4 +; GCN2-NEXT: .LBB83_2: ; 
%atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB83_2 +; GCN2-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_4 +; GCN3-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 
v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB83_2 +; GCN3-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3500,7 +11076,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3514,7 +11090,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3527,7 +11103,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: 
flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3541,7 +11117,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3551,7 +11127,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3565,11 +11141,11 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3584,7 +11160,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3598,7 +11174,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3613,7 +11189,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3627,7 +11203,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3637,7 +11213,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3651,12 +11227,12 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3669,7 +11245,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -3683,7 +11259,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -3698,7 +11274,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -3712,7 +11288,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 
v0, v4 @@ -3724,7 +11300,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3738,13 +11314,13 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3759,7 +11335,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3773,7 +11349,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3788,7 +11364,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: 
flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3802,7 +11378,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3812,7 +11388,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3826,14 +11402,14 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3854,7 +11430,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: 
v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3868,7 +11444,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3889,7 +11465,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3903,7 +11479,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3919,7 +11495,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], 
v[2:3] @@ -3933,11 +11509,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3958,7 +11534,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3972,7 +11548,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3993,7 +11569,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4007,7 +11583,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], 
vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4023,7 +11599,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4037,12 +11613,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4063,7 +11639,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4077,7 +11653,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; 
GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4098,7 +11674,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4112,7 +11688,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4128,7 +11704,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4142,11 +11718,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst + %result = atomicrmw max ptr %ptr, 
i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4167,7 +11743,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4181,7 +11757,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4202,7 +11778,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4216,7 +11792,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4232,7 +11808,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: 
.LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4246,12 +11822,12 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4272,7 +11848,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4286,7 +11862,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4306,7 +11882,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4320,7 +11896,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4338,7 +11914,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4352,13 +11928,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4378,7 +11954,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -4392,7 +11968,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; 
GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4415,7 +11991,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4429,7 +12005,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +12026,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4464,7 +12040,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4474,7 +12050,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -4494,7 +12070,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4508,7 +12084,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4526,7 +12102,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4540,7 +12116,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4558,7 
+12134,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4572,12 +12148,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4595,7 +12171,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -4609,7 +12185,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4630,7 +12206,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; 
GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4644,7 +12220,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4665,7 +12241,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4679,7 +12255,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4688,7 +12264,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -4704,7 +12280,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: 
flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4718,7 +12294,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4733,7 +12309,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4747,7 +12323,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4757,7 +12333,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4771,12 +12347,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -4791,7 +12367,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4805,7 +12381,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4820,7 +12396,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4834,7 +12410,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4844,7 +12420,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4858,14 +12434,14 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -4882,7 +12458,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4896,7 +12472,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4909,7 +12485,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4923,7 +12499,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4933,7 +12509,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4947,11 +12523,11 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: 
s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4966,7 +12542,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4980,7 +12556,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4995,7 +12571,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5009,7 +12585,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ 
-5019,7 +12595,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5033,12 +12609,12 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5051,7 +12627,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -5065,7 +12641,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -5080,7 +12656,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: 
flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -5094,7 +12670,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -5106,7 +12682,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5120,13 +12696,13 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5141,7 +12717,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: 
.LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5155,7 +12731,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5170,7 +12746,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5184,7 +12760,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5194,7 +12770,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5208,14 +12784,14 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; 
GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5236,7 +12812,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5250,7 +12826,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5271,7 +12847,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5285,7 +12861,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: 
v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5301,7 +12877,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5315,11 +12891,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5340,7 +12916,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5354,7 +12930,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: 
s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5375,7 +12951,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5389,7 +12965,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5405,7 +12981,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5419,12 +12995,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, 
i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5445,7 +13021,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5459,7 +13035,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5480,7 +13056,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5494,7 +13070,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5510,7 +13086,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: 
.LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5524,11 +13100,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5549,7 +13125,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5563,7 +13139,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_cbranch_execnz .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5584,7 +13160,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5598,7 +13174,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_cbranch_execnz .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5614,7 +13190,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5628,12 +13204,12 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_cbranch_execnz .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5654,7 +13230,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 
vcc, s[2:3], v[2:3] @@ -5668,7 +13244,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cbranch_execnz .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5688,7 +13264,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -5702,7 +13278,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cbranch_execnz .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5720,7 +13296,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5734,13 +13310,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cbranch_execnz .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: 
%ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5760,7 +13336,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -5774,7 +13350,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5797,7 +13373,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -5811,7 +13387,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5832,7 +13408,7 @@ define amdgpu_kernel void 
@atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5846,7 +13422,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5856,7 +13432,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -5875,7 +13451,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -5889,7 +13465,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5910,7 +13486,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -5924,7 +13500,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5945,7 +13521,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5959,7 +13535,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5968,7 +13544,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax 
ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -5984,7 +13560,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5998,7 +13574,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6013,7 +13589,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -6027,7 +13603,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6037,7 +13613,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -6051,12 +13627,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -6071,7 +13647,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6085,7 +13661,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6100,7 +13676,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: 
flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6114,7 +13690,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6124,7 +13700,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6138,14 +13714,14 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -6162,7 +13738,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 
%in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6176,7 +13752,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6189,7 +13765,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6203,7 +13779,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6213,7 +13789,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6227,11 +13803,11 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6246,7 +13822,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6260,7 +13836,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6275,7 +13851,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6289,7 +13865,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: 
v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6299,7 +13875,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6313,12 +13889,12 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6331,7 +13907,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -6345,7 +13921,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: 
s_cbranch_execnz .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -6360,7 +13936,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -6374,7 +13950,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -6386,7 +13962,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6400,13 +13976,13 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace 
!1 ret i64 %result } @@ -6421,7 +13997,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6435,7 +14011,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6450,7 +14026,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6464,7 +14040,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6474,7 +14050,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_1: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6488,14 +14064,14 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6516,7 +14092,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6530,7 +14106,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6551,7 +14127,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN2-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6565,7 +14141,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6581,7 +14157,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6595,11 +14171,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6620,7 +14196,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ 
-6634,7 +14210,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6655,7 +14231,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6669,7 +14245,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6685,7 +14261,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6699,12 +14275,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 
+; GCN3-NEXT: s_cbranch_execnz .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6725,7 +14301,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6739,7 +14315,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6760,7 +14336,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6774,7 +14350,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: 
s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6790,7 +14366,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6804,11 +14380,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6829,7 +14405,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6843,7 +14419,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6864,7 +14440,7 @@ define amdgpu_gfx i64 
@flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6878,7 +14454,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6894,7 +14470,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6908,12 +14484,12 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6928,7 +14504,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: 
flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6942,7 +14518,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6957,7 +14533,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6971,7 +14547,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6981,7 +14557,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6995,12 +14571,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -7015,7 +14591,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7029,7 +14605,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7044,7 +14620,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7058,7 +14634,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7068,7 +14644,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7082,14 +14658,14 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -7106,7 +14682,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7120,7 +14696,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7133,7 +14709,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7147,7 +14723,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7157,7 +14733,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7171,11 +14747,11 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, 
s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7190,7 +14766,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7204,7 +14780,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7219,7 +14795,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7233,7 +14809,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: 
s_setpc_b64 s[30:31] @@ -7243,7 +14819,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7257,12 +14833,12 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7275,7 +14851,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -7289,7 +14865,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_cbranch_execnz .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -7304,7 +14880,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, 
v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -7318,7 +14894,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_cbranch_execnz .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -7330,7 +14906,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7344,13 +14920,13 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_cbranch_execnz .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7365,7 +14941,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB120_1: ; 
%atomicrmw.start +; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7379,7 +14955,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7394,7 +14970,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7408,7 +14984,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7418,7 +14994,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7432,14 +15008,14 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_cbranch_execnz .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7460,7 +15036,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7474,7 +15050,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7495,7 +15071,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7509,7 +15085,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7525,7 +15101,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7539,11 +15115,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7564,7 +15140,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7578,7 +15154,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz 
.LBB122_1 +; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7599,7 +15175,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7613,7 +15189,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7629,7 +15205,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7643,12 +15219,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = 
atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7669,7 +15245,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7683,7 +15259,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7704,7 +15280,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7718,7 +15294,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7734,7 +15310,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, 
s5 -; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -7748,11 +15324,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7773,7 +15349,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7787,7 +15363,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7808,7 +15384,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7822,7 +15398,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7838,7 +15414,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -7852,12 +15428,12 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7878,7 +15454,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: 
v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7892,7 +15468,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7912,7 +15488,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7926,7 +15502,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7944,7 +15520,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7958,13 +15534,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7984,7 +15560,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -7998,7 +15574,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -8021,7 +15597,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -8035,7 +15611,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -8056,7 +15632,7 @@ define 
amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -8070,7 +15646,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -8080,7 +15656,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -8098,7 +15674,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -8112,7 +15688,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cbranch_execnz .LBB131_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -8128,7 +15704,7 @@ define amdgpu_kernel 
void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -8142,7 +15718,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cbranch_execnz .LBB131_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -8158,7 +15734,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -8172,11 +15748,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cbranch_execnz .LBB131_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8194,7 +15770,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -8208,7 +15784,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_cbranch_execnz .LBB132_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -8229,7 +15805,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -8243,7 +15819,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_cbranch_execnz .LBB132_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -8264,7 +15840,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -8278,7 +15854,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, 
s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_cbranch_execnz .LBB132_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -8287,7 +15863,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -8303,7 +15879,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8317,7 +15893,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cbranch_execnz .LBB133_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -8332,7 +15908,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8346,7 +15922,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cbranch_execnz .LBB133_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -8356,7 +15932,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8370,12 +15946,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cbranch_execnz .LBB133_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -8390,7 +15966,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -8404,7 +15980,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; 
GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_cbranch_execnz .LBB134_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -8419,7 +15995,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -8433,7 +16009,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_cbranch_execnz .LBB134_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -8443,7 +16019,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -8457,14 +16033,14 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_cbranch_execnz .LBB134_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, 
exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -8496,7 +16072,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8529,7 +16105,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8557,7 +16133,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8590,7 +16166,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8630,7 +16206,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw 
uinc_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8675,7 +16251,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8715,7 +16291,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8760,7 +16336,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8793,7 +16369,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -8826,7 +16402,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -8858,7 +16434,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8891,7 +16467,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8919,7 +16495,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8952,7 +16528,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8992,7 +16568,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9037,7 +16613,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9077,7 +16653,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9122,7 +16698,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9155,7 +16731,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -9188,8 +16764,9 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index e45b5cb30ab89..973ca51667928 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1365,7 +1365,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1390,7 +1390,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, 
double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1419,7 +1419,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1459,7 +1459,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1 ret void } @@ -1666,3 +1666,4 @@ attributes #3 = { "denormal-fp-math"="ieee,ieee" } attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 786fe03164690..68ebc21e2ba4d 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -37,12 +37,11 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -62,34 +61,33 @@ define i128 
@fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; 
SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB0_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 @@ -102,9 +100,9 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] @@ -112,7 +110,7 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB0_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -409,12 +407,11 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -434,34 +431,33 @@ define i128 @fptoui_f64_to_i128(double %x) { ; 
SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 -; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mul_lo_u32 v12, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 -; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] -; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mul_lo_u32 v6, v10, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v10, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v9, v7 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v12 ; SDAG-NEXT: v_mov_b32_e32 v6, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[4:5] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v11 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr11 ; SDAG-NEXT: ; implicit-def: $vgpr8 -; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add3_u32 v4, v9, v4, v10 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: ; implicit-def: $vgpr10 +; 
SDAG-NEXT: ; implicit-def: $vgpr9 ; SDAG-NEXT: .LBB1_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 @@ -474,9 +470,9 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v7, v4 ; SDAG-NEXT: v_mov_b32_e32 v4, v2 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] @@ -484,7 +480,7 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB1_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] @@ -780,7 +776,6 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] @@ -806,24 +801,24 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 -; 
SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -1138,7 +1133,6 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] @@ -1164,24 +1158,24 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 
0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 -; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] -; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] ; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 -; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 -; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v9, v12 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v8, v[5:6] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -1551,26 +1545,25 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, v5 +; SDAG-NEXT: v_mov_b32_e32 v8, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 ; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 @@ -1903,26 +1896,25 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 -; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 -; SDAG-NEXT: v_mov_b32_e32 v10, v5 +; SDAG-NEXT: v_mov_b32_e32 v8, v5 ; SDAG-NEXT: v_mov_b32_e32 v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, -1, s[4:5] ; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[4:5] +; 
SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v6, s[4:5], -1, v10 ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 ; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v10, v5 +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v8, v5 ; SDAG-NEXT: v_mul_lo_u32 v3, v6, v11 +; SDAG-NEXT: v_mul_lo_u32 v7, v6, v12 ; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mul_lo_u32 v7, v8, v12 ; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll index 469d0453b9dfb..ae309f3a614d5 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll @@ -6,7 +6,6 @@ declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) -declare void @llvm.amdgcn.cs.chain(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { ; GISEL-GFX11-LABEL: name: chain_to_chain @@ -20,9 +19,15 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -30,8 +35,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY7]], @callee, 0, 
-1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: chain_to_chain ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -44,20 +49,26 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: 
[[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: chain_to_chain ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -136,9 +147,15 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -146,8 +163,8 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: cs_to_chain ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -160,20 +177,26 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 
= COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit 
$sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: cs_to_chain ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -252,9 +275,15 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -262,8 +291,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY 
[[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -276,20 +305,26 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY 
$sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -368,9 +403,15 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -378,8 +419,8 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY10]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -392,20 +433,26 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit 
$vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -487,9 +534,15 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] @@ -510,15 +563,21 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; 
GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] - ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY12]] ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: indirect @@ -600,6 +659,93 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, unreachable } +; Indirect with callee that we cannot prove is uniform. 
+define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 %vgpr) { + ; GISEL-GFX11-LABEL: name: nonuniform_callee + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; + ; GISEL-GFX10-LABEL: name: nonuniform_callee + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY 
[[COPY4]] + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: nonuniform_callee + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $vgpr8, $vgpr9, $sgpr0, $vgpr10 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 killed 
[[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; + ; DAGISEL-GFX10-LABEL: name: nonuniform_callee + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $vgpr8, $vgpr9, $sgpr0, $vgpr10 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B32_]], amdgpu_allvgprs, 
implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $vgpr8 + call void(ptr, i32, i32, i32, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 -1, i32 inreg %sgpr, i32 %vgpr, i32 0) + unreachable +} + define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { ; GISEL-GFX11-LABEL: name: non_imm_exec ; GISEL-GFX11: bb.1 (%ir-block.0): @@ -613,9 +759,15 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY6]] @@ -623,8 +775,8 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; 
GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY8]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W32 [[COPY11]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: non_imm_exec ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -638,20 +790,26 @@ define amdgpu_cs_chain void @non_imm_exec(i32 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY 
[[COPY5]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY7]] - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY8]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY9]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[COPY12]], @callee, 0, [[COPY]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: non_imm_exec ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -734,9 +892,15 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY6]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY7]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY8]] @@ -758,15 +922,21 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i32 i ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY7]] ; 
GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY8]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY9]] - ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] + ; GISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY13]] ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W32 [[REG_SEQUENCE]], 0, 0, [[COPY2]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: indirect_with_non_imm_exec diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll index 51c28a02b7f82..90707e823c147 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll @@ -6,7 +6,6 @@ declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) declare amdgpu_cs_chain_preserve void @callee_preserve(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 }) -declare void @llvm.amdgcn.cs.chain(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
noreturn define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { ; GISEL-GFX11-LABEL: name: chain_to_chain @@ -20,9 +19,15 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -30,8 +35,8 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY7]], @callee, 0, 
-1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: chain_to_chain ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -44,20 +49,26 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: 
[[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: chain_to_chain ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -136,9 +147,15 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -146,8 +163,8 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY7]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY10]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: cs_to_chain ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -160,20 +177,26 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 
= COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY8]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY11]], @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit 
$sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: cs_to_chain ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -252,9 +275,15 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -262,8 +291,8 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY 
[[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY10]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: chain_to_chain_preserve ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -276,20 +305,26 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY 
$sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY11]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: chain_to_chain_preserve ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -368,9 +403,15 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY 
[[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY5]] @@ -378,8 +419,8 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY7]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY10]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: cs_to_chain_preserve ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -392,20 +433,26 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GISEL-GFX10-NEXT: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY1]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY4]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY6]] - ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY8]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit 
$vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY11]], @callee_preserve, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: cs_to_chain_preserve ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -487,9 +534,15 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] @@ -510,15 +563,21 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; 
GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] - ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY12]] ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: indirect @@ -600,6 +659,92 @@ define amdgpu_cs_chain void @indirect(ptr inreg %callee, <3 x i32> inreg %sgpr, unreachable } +; Indirect with callee that we cannot prove is uniform. 
+define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32 %vgpr) { + ; GISEL-GFX11-LABEL: name: nonuniform_callee + ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GISEL-GFX11-NEXT: {{ $}} + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; + ; GISEL-GFX10-LABEL: name: nonuniform_callee + ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GISEL-GFX10-NEXT: {{ $}} + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; GISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE]] + ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; GISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: $vgpr8 = COPY 
[[COPY4]] + ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY2]], 0, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; + ; DAGISEL-GFX11-LABEL: name: nonuniform_callee + ; DAGISEL-GFX11: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-NEXT: liveins: $vgpr8, $vgpr9, $sgpr0, $vgpr10 + ; DAGISEL-GFX11-NEXT: {{ $}} + ; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY]] + ; DAGISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 killed 
[[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + ; + ; DAGISEL-GFX10-LABEL: name: nonuniform_callee + ; DAGISEL-GFX10: bb.0 (%ir-block.0): + ; DAGISEL-GFX10-NEXT: liveins: $vgpr8, $vgpr9, $sgpr0, $vgpr10 + ; DAGISEL-GFX10-NEXT: {{ $}} + ; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr10 + ; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9 + ; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_1]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_]], %subreg.sub1 + ; DAGISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY]] + ; DAGISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 killed [[REG_SEQUENCE1]], 0, 0, killed [[S_MOV_B64_]], amdgpu_allvgprs, 
implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $sgpr0, implicit $vgpr8 + call void(ptr, i64, i32, i32, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i64 -1, i32 inreg %sgpr, i32 %vgpr, i32 0) + unreachable +} define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr) { ; GISEL-GFX11-LABEL: name: non_imm_exec ; GISEL-GFX11: bb.1 (%ir-block.0): @@ -615,9 +760,15 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY5]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY7]] @@ -625,8 +776,8 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; 
GISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]] - ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY9]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 + ; GISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]] + ; GISEL-GFX11-NEXT: SI_CS_CHAIN_TC_W64 [[COPY12]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; ; GISEL-GFX10-LABEL: name: non_imm_exec ; GISEL-GFX10: bb.1 (%ir-block.0): @@ -642,20 +793,26 @@ define amdgpu_cs_chain void @non_imm_exec(i64 inreg %exec, <3 x i32> inreg %sgpr ; GISEL-GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY3]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY5]] ; 
GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY6]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY7]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY8]] - ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY9]] + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY12]] ; GISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; GISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; GISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]] - ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY10]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:ccr_sgpr_64 = COPY [[REG_SEQUENCE1]] + ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[COPY13]], @callee, 0, [[REG_SEQUENCE]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: non_imm_exec ; DAGISEL-GFX11: bb.0 (%ir-block.0): @@ -744,9 +901,15 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; GISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY4]] - ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] - ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 
= COPY [[COPY4]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX11-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY7]] ; GISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY8]] ; GISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY9]] @@ -770,15 +933,21 @@ define amdgpu_cs_chain void @indirect_with_non_imm_exec(ptr inreg %callee, i64 i ; GISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr9 ; GISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr10 ; GISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr11 - ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY4]] - ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] - ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GISEL-GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY7]] ; 
GISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY8]] ; GISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY9]] ; GISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY10]] - ; GISEL-GFX10-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 - ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY11]] + ; GISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; GISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY14]] ; GISEL-GFX10-NEXT: SI_CS_CHAIN_TC_W64 [[REG_SEQUENCE]], 0, 0, [[REG_SEQUENCE1]], amdgpu_allvgprs, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $sgpr48_sgpr49_sgpr50_sgpr51 ; ; DAGISEL-GFX11-LABEL: name: indirect_with_non_imm_exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 8bff17b729927..049cc455ab01c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -24,6 +24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) { ret void } +; GFX10PLUS-LABEL: {{^}}dpp8_i64: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) { + %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) + store i64 %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v2i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<2 x i32> %in, i32 1) + store <2 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; 
GFX10PLUS-LABEL: {{^}}dpp8_v3i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1) + store <3 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4i32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off +define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<4 x i32> %in, i32 1) + store <4 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_p0: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) { + %tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1) + store ptr %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_p3: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off +define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) { + %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.v3p3(ptr addrspace(3) %in, i32 1) + store ptr addrspace(3) %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v3p3: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp 
v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1) + store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_i16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) { + %tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1) + store i16 %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4i16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1) + store <4 x i16> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4f16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1) + store <4 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_float: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off +define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) { + %tmp0 = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1) + store float %tmp0, ptr addrspace(1) %out + ret 
void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v3f32: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off +define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) { + %tmp0 = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1) + store <3 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_half: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) { + %tmp0 = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1) + store half %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_bfloat: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off +define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) { + %tmp0 = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1) + store bfloat %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { + %tmp0 = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_double: +; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off +define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) { + %tmp0 = 
call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1) + store double %tmp0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 46254994580d2..9bb8a2f9f0282 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GCN-NEXT: s_and_b32 s1, exec_lo, s4 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN-NEXT: s_and_b32 s1, exec_lo, s1 ; GCN-NEXT: s_or_b32 s2, s1, s2 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GCN-NEXT: s_cbranch_execz .LBB2_6 @@ -190,20 +190,17 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: .LBB2_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: v_mov_b32_e32 v7, v6 -; GCN-NEXT: s_mov_b32 s4, -1 -; GCN-NEXT: s_and_saveexec_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s1, -1 +; GCN-NEXT: s_and_saveexec_b32 s4, s3 ; GCN-NEXT: s_cbranch_execz .LBB2_1 ; GCN-NEXT: ; %bb.5: ; %latch ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 -; GCN-NEXT: v_mov_b32_e32 v7, v0 ; GCN-NEXT: s_add_i32 s0, s0, 1 -; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s1, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_6: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v6 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/ARM/cmse-clear-float-hard.ll b/llvm/test/CodeGen/ARM/cmse-clear-float-hard.ll index 606859db0a0ea..f97fc51a0c457 100644 --- 
a/llvm/test/CodeGen/ARM/cmse-clear-float-hard.ll +++ b/llvm/test/CodeGen/ARM/cmse-clear-float-hard.ll @@ -187,7 +187,7 @@ define float @f2(ptr nocapture %fptr) #2 { ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: ldr r1, [sp, #64] ; CHECK-8M-NEXT: bic r1, r1, #159 @@ -207,7 +207,7 @@ define float @f2(ptr nocapture %fptr) #2 { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -245,7 +245,7 @@ define double @d2(ptr nocapture %fptr) #2 { ; CHECK-8M-LE-NEXT: bic r0, r0, #1 ; CHECK-8M-LE-NEXT: sub sp, #136 ; CHECK-8M-LE-NEXT: vmov r11, r12, d0 -; CHECK-8M-LE-NEXT: vlstm sp +; CHECK-8M-LE-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-LE-NEXT: vmov d0, r11, r12 ; CHECK-8M-LE-NEXT: ldr r1, [sp, #64] ; CHECK-8M-LE-NEXT: bic r1, r1, #159 @@ -264,7 +264,7 @@ define double @d2(ptr nocapture %fptr) #2 { ; CHECK-8M-LE-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-LE-NEXT: blxns r0 ; CHECK-8M-LE-NEXT: vmov r11, r12, d0 -; CHECK-8M-LE-NEXT: vlldm sp +; CHECK-8M-LE-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-LE-NEXT: vmov d0, r11, r12 ; CHECK-8M-LE-NEXT: add sp, #136 ; CHECK-8M-LE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -283,7 +283,7 @@ define double @d2(ptr nocapture %fptr) #2 { ; CHECK-8M-BE-NEXT: bic r0, r0, #1 ; CHECK-8M-BE-NEXT: sub sp, #136 ; CHECK-8M-BE-NEXT: vmov r11, r12, d0 -; CHECK-8M-BE-NEXT: vlstm sp +; CHECK-8M-BE-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-BE-NEXT: vmov d0, r11, r12 ; CHECK-8M-BE-NEXT: ldr r1, [sp, #64] ; CHECK-8M-BE-NEXT: bic r1, r1, #159 @@ -302,7 +302,7 @@ define double @d2(ptr nocapture %fptr) #2 { ; CHECK-8M-BE-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-BE-NEXT: blxns r0 ; CHECK-8M-BE-NEXT: 
vmov r11, r12, d0 -; CHECK-8M-BE-NEXT: vlldm sp +; CHECK-8M-BE-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-BE-NEXT: vmov d0, r11, r12 ; CHECK-8M-BE-NEXT: add sp, #136 ; CHECK-8M-BE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -368,7 +368,7 @@ define float @f3(ptr nocapture %fptr) #4 { ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: ldr r1, [sp, #64] ; CHECK-8M-NEXT: bic r1, r1, #159 @@ -388,7 +388,7 @@ define float @f3(ptr nocapture %fptr) #4 { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -426,7 +426,7 @@ define double @d3(ptr nocapture %fptr) #4 { ; CHECK-8M-LE-NEXT: bic r0, r0, #1 ; CHECK-8M-LE-NEXT: sub sp, #136 ; CHECK-8M-LE-NEXT: vmov r11, r12, d0 -; CHECK-8M-LE-NEXT: vlstm sp +; CHECK-8M-LE-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-LE-NEXT: vmov d0, r11, r12 ; CHECK-8M-LE-NEXT: ldr r1, [sp, #64] ; CHECK-8M-LE-NEXT: bic r1, r1, #159 @@ -445,7 +445,7 @@ define double @d3(ptr nocapture %fptr) #4 { ; CHECK-8M-LE-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-LE-NEXT: blxns r0 ; CHECK-8M-LE-NEXT: vmov r11, r12, d0 -; CHECK-8M-LE-NEXT: vlldm sp +; CHECK-8M-LE-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-LE-NEXT: vmov d0, r11, r12 ; CHECK-8M-LE-NEXT: add sp, #136 ; CHECK-8M-LE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -464,7 +464,7 @@ define double @d3(ptr nocapture %fptr) #4 { ; CHECK-8M-BE-NEXT: bic r0, r0, #1 ; CHECK-8M-BE-NEXT: sub sp, #136 ; CHECK-8M-BE-NEXT: vmov r11, r12, d0 -; CHECK-8M-BE-NEXT: vlstm sp +; CHECK-8M-BE-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-BE-NEXT: vmov d0, r11, r12 ; CHECK-8M-BE-NEXT: ldr r1, [sp, #64] ; CHECK-8M-BE-NEXT: bic r1, r1, #159 @@ -483,7 +483,7 @@ define 
double @d3(ptr nocapture %fptr) #4 { ; CHECK-8M-BE-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-BE-NEXT: blxns r0 ; CHECK-8M-BE-NEXT: vmov r11, r12, d0 -; CHECK-8M-BE-NEXT: vlldm sp +; CHECK-8M-BE-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-BE-NEXT: vmov d0, r11, r12 ; CHECK-8M-BE-NEXT: add sp, #136 ; CHECK-8M-BE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -547,8 +547,9 @@ define float @f4(ptr nocapture %fptr) #6 { ; CHECK-8M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vmov.f32 s0, s0 ; CHECK-8M-NEXT: mov r1, r0 +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: mov r2, r0 ; CHECK-8M-NEXT: mov r3, r0 ; CHECK-8M-NEXT: mov r4, r0 @@ -563,7 +564,7 @@ define float @f4(ptr nocapture %fptr) #6 { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -598,8 +599,9 @@ define double @d4(ptr nocapture %fptr) #6 { ; CHECK-8M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vmov.f32 s0, s0 ; CHECK-8M-NEXT: mov r1, r0 +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: mov r2, r0 ; CHECK-8M-NEXT: mov r3, r0 ; CHECK-8M-NEXT: mov r4, r0 @@ -614,7 +616,7 @@ define double @d4(ptr nocapture %fptr) #6 { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r11, r12, d0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov d0, r11, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -649,7 +651,7 @@ define void @fd(ptr %f, float %a, double %b) #8 { ; CHECK-8M-NEXT: vmov r12, s0 ; CHECK-8M-NEXT: mov r2, r0 ; CHECK-8M-NEXT: vmov r10, r11, d1 -; 
CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: vmov d1, r10, r11 ; CHECK-8M-NEXT: ldr r1, [sp, #64] @@ -666,7 +668,7 @@ define void @fd(ptr %f, float %a, double %b) #8 { ; CHECK-8M-NEXT: mov r9, r0 ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: pop {r7, pc} @@ -708,7 +710,7 @@ define void @fdff(ptr %f, float %a, double %b, float %c, float %d) #8 { ; CHECK-8M-NEXT: vmov r9, s1 ; CHECK-8M-NEXT: mov r4, r0 ; CHECK-8M-NEXT: vmov r8, s4 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: vmov d1, r10, r11 ; CHECK-8M-NEXT: vmov s1, r9 @@ -723,7 +725,7 @@ define void @fdff(ptr %f, float %a, double %b, float %c, float %d) #8 { ; CHECK-8M-NEXT: mov r7, r0 ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: pop {r7, pc} @@ -765,7 +767,7 @@ define void @fidififid(ptr %fu, float %a, i32 %b, double %c, i32 %d, float %e, i ; CHECK-8M-NEXT: vmov r8, s1 ; CHECK-8M-NEXT: vmov r7, s4 ; CHECK-8M-NEXT: vmov r5, r6, d3 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r11 ; CHECK-8M-NEXT: vmov d1, r9, r10 ; CHECK-8M-NEXT: vmov s1, r8 @@ -778,7 +780,7 @@ define void @fidififid(ptr %fu, float %a, i32 %b, double %c, i32 %d, float %e, i ; CHECK-8M-NEXT: mov r4, r12 ; CHECK-8M-NEXT: msr apsr_nzcvqg, r12 ; CHECK-8M-NEXT: blxns r12 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: pop {r7, pc} @@ -897,7 +899,7 @@ define half @h2(ptr nocapture %hptr) 
nounwind { ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: ldr r1, [sp, #64] ; CHECK-8M-NEXT: bic r1, r1, #159 @@ -917,7 +919,7 @@ define half @h2(ptr nocapture %hptr) nounwind { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -976,7 +978,7 @@ define half @h3(ptr nocapture %hptr) nounwind { ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: ldr r1, [sp, #64] ; CHECK-8M-NEXT: bic r1, r1, #159 @@ -996,7 +998,7 @@ define half @h3(ptr nocapture %hptr) nounwind { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -1053,8 +1055,9 @@ define half @h4(ptr nocapture %hptr) nounwind { ; CHECK-8M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vmov.f32 s0, s0 ; CHECK-8M-NEXT: mov r1, r0 +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: mov r2, r0 ; CHECK-8M-NEXT: mov r3, r0 ; CHECK-8M-NEXT: mov r4, r0 @@ -1069,7 +1072,7 @@ define half @h4(ptr nocapture %hptr) nounwind { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w 
{r4, r5, r6, r7, r8, r9, r10, r11} @@ -1176,7 +1179,7 @@ define half @h1_arg(ptr nocapture %hptr, half %harg) nounwind { ; CHECK-8M-NEXT: bic r0, r0, #1 ; CHECK-8M-NEXT: sub sp, #136 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlstm sp +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: ldr r1, [sp, #64] ; CHECK-8M-NEXT: bic r1, r1, #159 @@ -1196,7 +1199,7 @@ define half @h1_arg(ptr nocapture %hptr, half %harg) nounwind { ; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 ; CHECK-8M-NEXT: blxns r0 ; CHECK-8M-NEXT: vmov r12, s0 -; CHECK-8M-NEXT: vlldm sp +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} ; CHECK-8M-NEXT: vmov s0, r12 ; CHECK-8M-NEXT: add sp, #136 ; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} @@ -1241,3 +1244,100 @@ entry: ret half %call } +define float @float_return_undef_arg(ptr nocapture %fptr) #6 { +; CHECK-8M-LABEL: float_return_undef_arg: +; CHECK-8M: @ %bb.0: @ %entry +; CHECK-8M-NEXT: push {r7, lr} +; CHECK-8M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-8M-NEXT: bic r0, r0, #1 +; CHECK-8M-NEXT: sub sp, #136 +; CHECK-8M-NEXT: vmov.f32 s0, s0 +; CHECK-8M-NEXT: mov r1, r0 +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} +; CHECK-8M-NEXT: mov r2, r0 +; CHECK-8M-NEXT: mov r3, r0 +; CHECK-8M-NEXT: mov r4, r0 +; CHECK-8M-NEXT: mov r5, r0 +; CHECK-8M-NEXT: mov r6, r0 +; CHECK-8M-NEXT: mov r7, r0 +; CHECK-8M-NEXT: mov r8, r0 +; CHECK-8M-NEXT: mov r9, r0 +; CHECK-8M-NEXT: mov r10, r0 +; CHECK-8M-NEXT: mov r11, r0 +; CHECK-8M-NEXT: mov r12, r0 +; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 +; CHECK-8M-NEXT: blxns r0 +; CHECK-8M-NEXT: vmov r12, s0 +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} +; CHECK-8M-NEXT: vmov s0, r12 +; CHECK-8M-NEXT: add sp, #136 +; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-8M-NEXT: pop {r7, pc} +; +; CHECK-81M-LABEL: float_return_undef_arg: +; CHECK-81M: @ %bb.0: @ %entry +; CHECK-81M-NEXT: push {r7, lr} +; CHECK-81M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-81M-NEXT: bic r0, r0, 
#1 +; CHECK-81M-NEXT: vpush {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31} +; CHECK-81M-NEXT: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} +; CHECK-81M-NEXT: vstr fpcxts, [sp, #-8]! +; CHECK-81M-NEXT: clrm {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, apsr} +; CHECK-81M-NEXT: blxns r0 +; CHECK-81M-NEXT: vldr fpcxts, [sp], #8 +; CHECK-81M-NEXT: vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31} +; CHECK-81M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-81M-NEXT: pop {r7, pc} +entry: + %call = call float %fptr(i32 undef) #7 + ret float %call +} + +define float @float_return_poison_arg(ptr nocapture %fptr) #6 { +; CHECK-8M-LABEL: float_return_poison_arg: +; CHECK-8M: @ %bb.0: @ %entry +; CHECK-8M-NEXT: push {r7, lr} +; CHECK-8M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-8M-NEXT: bic r0, r0, #1 +; CHECK-8M-NEXT: sub sp, #136 +; CHECK-8M-NEXT: vmov.f32 s0, s0 +; CHECK-8M-NEXT: mov r1, r0 +; CHECK-8M-NEXT: vlstm sp, {d0 - d15} +; CHECK-8M-NEXT: mov r2, r0 +; CHECK-8M-NEXT: mov r3, r0 +; CHECK-8M-NEXT: mov r4, r0 +; CHECK-8M-NEXT: mov r5, r0 +; CHECK-8M-NEXT: mov r6, r0 +; CHECK-8M-NEXT: mov r7, r0 +; CHECK-8M-NEXT: mov r8, r0 +; CHECK-8M-NEXT: mov r9, r0 +; CHECK-8M-NEXT: mov r10, r0 +; CHECK-8M-NEXT: mov r11, r0 +; CHECK-8M-NEXT: mov r12, r0 +; CHECK-8M-NEXT: msr apsr_nzcvqg, r0 +; CHECK-8M-NEXT: blxns r0 +; CHECK-8M-NEXT: vmov r12, s0 +; CHECK-8M-NEXT: vlldm sp, {d0 - d15} +; CHECK-8M-NEXT: vmov s0, r12 +; CHECK-8M-NEXT: add sp, #136 +; CHECK-8M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-8M-NEXT: pop {r7, pc} +; +; CHECK-81M-LABEL: float_return_poison_arg: +; CHECK-81M: @ %bb.0: @ %entry +; CHECK-81M-NEXT: push {r7, lr} +; CHECK-81M-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-81M-NEXT: bic r0, r0, #1 +; CHECK-81M-NEXT: vpush 
{s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31} +; CHECK-81M-NEXT: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} +; CHECK-81M-NEXT: vstr fpcxts, [sp, #-8]! +; CHECK-81M-NEXT: clrm {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, apsr} +; CHECK-81M-NEXT: blxns r0 +; CHECK-81M-NEXT: vldr fpcxts, [sp], #8 +; CHECK-81M-NEXT: vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31} +; CHECK-81M-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11} +; CHECK-81M-NEXT: pop {r7, pc} +entry: + %call = call float %fptr(i32 poison) #7 + ret float %call +} diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir index 3d49fee8fdaf4..29429fd5a23eb 100644 --- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir +++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir @@ -89,6 +89,7 @@ body: | # CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11 # CHECK-NEXT: $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg # CHECK-NEXT: $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg +# CHECK-NEXT: dead $s0 = VMOVS undef $s0, 14 /* CC::al */, $noreg # CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15 # CHECK-NEXT: $r1 = tMOVr $r0, 14 /* CC::al */, $noreg # CHECK-NEXT: $r2 = tMOVr $r0, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/DirectX/group_memory_barrier_with_group_sync.ll 
b/llvm/test/CodeGen/DirectX/group_memory_barrier_with_group_sync.ll deleted file mode 100644 index baf93d4e177f0..0000000000000 --- a/llvm/test/CodeGen/DirectX/group_memory_barrier_with_group_sync.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s - -define void @test_group_memory_barrier_with_group_sync() { -entry: - ; CHECK: call void @dx.op.barrier(i32 80, i32 9) - call void @llvm.dx.group.memory.barrier.with.group.sync() - ret void -} \ No newline at end of file diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll index 06dfe00d90847..5c9575b2baab1 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll @@ -123,13 +123,12 @@ define i64 @caller_large_scalars() nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -80 ; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ; CHECK-NEXT: st.d $zero, $sp, 24 -; CHECK-NEXT: st.d $zero, $sp, 16 -; CHECK-NEXT: st.d $zero, $sp, 8 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vst $vr0, $sp, 8 ; CHECK-NEXT: ori $a0, $zero, 2 ; CHECK-NEXT: st.d $a0, $sp, 0 ; CHECK-NEXT: st.d $zero, $sp, 56 -; CHECK-NEXT: st.d $zero, $sp, 48 -; CHECK-NEXT: st.d $zero, $sp, 40 +; CHECK-NEXT: vst $vr0, $sp, 40 ; CHECK-NEXT: ori $a2, $zero, 1 ; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: addi.d $a1, $sp, 0 @@ -182,14 +181,13 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; CHECK-NEXT: ori $a0, $zero, 9 ; CHECK-NEXT: st.d $a0, $sp, 0 ; CHECK-NEXT: st.d $zero, $sp, 40 -; CHECK-NEXT: st.d $zero, $sp, 32 -; CHECK-NEXT: st.d $zero, $sp, 24 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vst $vr0, $sp, 24 ; CHECK-NEXT: ori $a0, $zero, 10 ; CHECK-NEXT: st.d $a0, $sp, 16 ; CHECK-NEXT: st.d $zero, $sp, 72 -; CHECK-NEXT: st.d $zero, $sp, 64 -; CHECK-NEXT: st.d $zero, $sp, 56 -; CHECK-NEXT: ori $t0, $zero, 8 +; CHECK-NEXT: ori $a0, $zero, 8 +; 
CHECK-NEXT: st.d $a0, $sp, 48 ; CHECK-NEXT: ori $a0, $zero, 1 ; CHECK-NEXT: ori $a1, $zero, 2 ; CHECK-NEXT: ori $a2, $zero, 3 @@ -198,7 +196,7 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; CHECK-NEXT: ori $a5, $zero, 6 ; CHECK-NEXT: ori $a6, $zero, 7 ; CHECK-NEXT: addi.d $a7, $sp, 48 -; CHECK-NEXT: st.d $t0, $sp, 48 +; CHECK-NEXT: vst $vr0, $sp, 56 ; CHECK-NEXT: bl %plt(callee_large_scalars_exhausted_regs) ; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload ; CHECK-NEXT: addi.d $sp, $sp, 96 diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll index 34fbec03c535b..35186b660c1e6 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll @@ -63,26 +63,17 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2) -; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3) -; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4) -; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5) -; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6) -; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6) -; CHECK-NEXT: addi.d $a0, $zero, 1 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: ffint.d.l $fa0, $fa0 ; CHECK-NEXT: ori $a0, $zero, 0 ; CHECK-NEXT: lu32i.d $a0, 131072 ; CHECK-NEXT: lu52i.d $a0, $a0, 1026 +; CHECK-NEXT: vldi $vr0, -912 +; CHECK-NEXT: vldi $vr1, -1024 +; CHECK-NEXT: vldi $vr2, -1016 +; CHECK-NEXT: 
vldi $vr3, -1008 +; CHECK-NEXT: vldi $vr4, -1004 +; CHECK-NEXT: vldi $vr5, -1000 +; CHECK-NEXT: vldi $vr6, -996 +; CHECK-NEXT: vldi $vr7, -992 ; CHECK-NEXT: bl %plt(callee_double_in_gpr_exhausted_fprs) ; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; CHECK-NEXT: addi.d $sp, $sp, 16 @@ -98,9 +89,7 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind { define double @callee_double_ret() nounwind { ; CHECK-LABEL: callee_double_ret: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $a0, $zero, 1 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: ffint.d.l $fa0, $fa0 +; CHECK-NEXT: vldi $vr0, -912 ; CHECK-NEXT: ret ret double 1.0 } diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll index 558b9457239c1..a10d30c372f16 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc --mtriple=loongarch64 --target-abi=lp64s < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --target-abi=lp64s --mattr=-f < %s | FileCheck %s ;; This file contains specific tests for the lp64s ABI. 
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll index 7bc7a982db86d..14bd0f4df4710 100644 --- a/llvm/test/CodeGen/LoongArch/code-models.ll +++ b/llvm/test/CodeGen/LoongArch/code-models.ll @@ -105,8 +105,8 @@ define i32 @caller_tail(i32 %i) nounwind { ; ; MEDIUM-LABEL: caller_tail: ; MEDIUM: # %bb.0: # %entry -; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) -; MEDIUM-NEXT: jr $a1 +; MEDIUM-NEXT: pcaddu18i $t8, %call36(callee_tail) +; MEDIUM-NEXT: jr $t8 ; ; LARGE-LABEL: caller_tail: ; LARGE: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll index a26102710cbeb..161ed573c81f0 100644 --- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll @@ -175,16 +175,11 @@ define i8 @test_ctpop_i8(i8 %a) nounwind { ; ; LA64-LABEL: test_ctpop_i8: ; LA64: # %bb.0: -; LA64-NEXT: srli.d $a1, $a0, 1 -; LA64-NEXT: andi $a1, $a1, 85 -; LA64-NEXT: sub.d $a0, $a0, $a1 -; LA64-NEXT: andi $a1, $a0, 51 -; LA64-NEXT: srli.d $a0, $a0, 2 -; LA64-NEXT: andi $a0, $a0, 51 -; LA64-NEXT: add.d $a0, $a1, $a0 -; LA64-NEXT: srli.d $a1, $a0, 4 -; LA64-NEXT: add.d $a0, $a0, $a1 -; LA64-NEXT: andi $a0, $a0, 15 +; LA64-NEXT: andi $a0, $a0, 255 +; LA64-NEXT: vldi $vr0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpcnt.d $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-NEXT: ret %1 = call i8 @llvm.ctpop.i8(i8 %a) ret i8 %1 @@ -213,22 +208,11 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { ; ; LA64-LABEL: test_ctpop_i16: ; LA64: # %bb.0: -; LA64-NEXT: srli.d $a1, $a0, 1 -; LA64-NEXT: lu12i.w $a2, 5 -; LA64-NEXT: ori $a2, $a2, 1365 -; LA64-NEXT: and $a1, $a1, $a2 -; LA64-NEXT: sub.d $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 3 -; LA64-NEXT: ori $a1, $a1, 819 -; LA64-NEXT: and $a2, $a0, $a1 -; LA64-NEXT: srli.d $a0, $a0, 2 -; LA64-NEXT: and $a0, $a0, $a1 -; LA64-NEXT: add.d $a0, $a2, $a0 -; LA64-NEXT: srli.d $a1, 
$a0, 4 -; LA64-NEXT: add.d $a0, $a0, $a1 -; LA64-NEXT: bstrpick.d $a1, $a0, 11, 8 -; LA64-NEXT: andi $a0, $a0, 15 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0 +; LA64-NEXT: vldi $vr0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpcnt.d $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-NEXT: ret %1 = call i16 @llvm.ctpop.i16(i16 %a) ret i16 %1 @@ -261,26 +245,11 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; ; LA64-LABEL: test_ctpop_i32: ; LA64: # %bb.0: -; LA64-NEXT: srli.d $a1, $a0, 1 -; LA64-NEXT: lu12i.w $a2, 349525 -; LA64-NEXT: ori $a2, $a2, 1365 -; LA64-NEXT: and $a1, $a1, $a2 -; LA64-NEXT: sub.d $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 209715 -; LA64-NEXT: ori $a1, $a1, 819 -; LA64-NEXT: and $a2, $a0, $a1 -; LA64-NEXT: srli.d $a0, $a0, 2 -; LA64-NEXT: and $a0, $a0, $a1 -; LA64-NEXT: add.d $a0, $a2, $a0 -; LA64-NEXT: srli.d $a1, $a0, 4 -; LA64-NEXT: add.d $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 61680 -; LA64-NEXT: ori $a1, $a1, 3855 -; LA64-NEXT: and $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 4112 -; LA64-NEXT: ori $a1, $a1, 257 -; LA64-NEXT: mul.d $a0, $a0, $a1 -; LA64-NEXT: bstrpick.d $a0, $a0, 31, 24 +; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0 +; LA64-NEXT: vldi $vr0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpcnt.d $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-NEXT: ret %1 = call i32 @llvm.ctpop.i32(i32 %a) ret i32 %1 @@ -327,30 +296,10 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; LA64-LABEL: test_ctpop_i64: ; LA64: # %bb.0: -; LA64-NEXT: srli.d $a1, $a0, 1 -; LA64-NEXT: lu12i.w $a2, 349525 -; LA64-NEXT: ori $a2, $a2, 1365 -; LA64-NEXT: bstrins.d $a2, $a2, 62, 32 -; LA64-NEXT: and $a1, $a1, $a2 -; LA64-NEXT: sub.d $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 209715 -; LA64-NEXT: ori $a1, $a1, 819 -; LA64-NEXT: bstrins.d $a1, $a1, 61, 32 -; LA64-NEXT: and $a2, $a0, $a1 -; LA64-NEXT: srli.d $a0, $a0, 2 -; LA64-NEXT: and $a0, $a0, $a1 -; LA64-NEXT: add.d $a0, $a2, $a0 -; 
LA64-NEXT: srli.d $a1, $a0, 4 -; LA64-NEXT: add.d $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 61680 -; LA64-NEXT: ori $a1, $a1, 3855 -; LA64-NEXT: bstrins.d $a1, $a1, 59, 32 -; LA64-NEXT: and $a0, $a0, $a1 -; LA64-NEXT: lu12i.w $a1, 4112 -; LA64-NEXT: ori $a1, $a1, 257 -; LA64-NEXT: bstrins.d $a1, $a1, 56, 32 -; LA64-NEXT: mul.d $a0, $a0, $a1 -; LA64-NEXT: srli.d $a0, $a0, 56 +; LA64-NEXT: vldi $vr0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpcnt.d $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-NEXT: ret %1 = call i64 @llvm.ctpop.i64(i64 %a) ret i64 %1 diff --git a/llvm/test/CodeGen/LoongArch/double-imm.ll b/llvm/test/CodeGen/LoongArch/double-imm.ll index 8d50b27907d72..fe403ec532d8e 100644 --- a/llvm/test/CodeGen/LoongArch/double-imm.ll +++ b/llvm/test/CodeGen/LoongArch/double-imm.ll @@ -59,9 +59,7 @@ define double @f64_add_fimm1(double %a) nounwind { ; ; LA64-LABEL: f64_add_fimm1: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $a0, $zero, 1 -; LA64-NEXT: movgr2fr.d $fa1, $a0 -; LA64-NEXT: ffint.d.l $fa1, $fa1 +; LA64-NEXT: vldi $vr1, -912 ; LA64-NEXT: fadd.d $fa0, $fa0, $fa1 ; LA64-NEXT: ret %1 = fadd double %a, 1.0 @@ -79,9 +77,7 @@ define double @f64_positive_fimm1() nounwind { ; ; LA64-LABEL: f64_positive_fimm1: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $a0, $zero, 1 -; LA64-NEXT: movgr2fr.d $fa0, $a0 -; LA64-NEXT: ffint.d.l $fa0, $fa0 +; LA64-NEXT: vldi $vr0, -912 ; LA64-NEXT: ret ret double 1.0 } diff --git a/llvm/test/CodeGen/LoongArch/expand-adjacency.ll b/llvm/test/CodeGen/LoongArch/expand-adjacency.ll new file mode 100644 index 0000000000000..154d2121a6321 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/expand-adjacency.ll @@ -0,0 +1,120 @@ +; RUN: llc --mtriple=loongarch64 --relocation-model=pic \ +; RUN: --code-model=medium < %s | FileCheck --check-prefix=MEDIUM %s +; RUN: llc --mtriple=loongarch64 --relocation-model=pic \ +; RUN: --code-model=large < %s | FileCheck --check-prefix=LARGE %s +; RUN: llc --mtriple=loongarch64 
--relocation-model=pic \ +; RUN: --enable-tlsdesc --code-model=large < %s | \ +; RUN: FileCheck --check-prefix=LARGEDESC %s + +; Check the adjancency of pseudo-instruction expansions to ensure +; compliance with psABI requirements: +; https://github.com/loongson/la-abi-specs/releases/tag/v2.30 + +declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) + +define void @call_external_sym(ptr %dst) { +; LARGE-LABEL: call_external_sym: +; LARGE: pcalau12i [[REG1:\$[a-z0-9]+]], %pc_hi20(memset) +; LARGE-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %pc_lo12(memset) +; LARGE-NEXT: lu32i.d [[REG2]], %pc64_lo20(memset) +; LARGE-NEXT: lu52i.d [[REG2]], [[REG2]], %pc64_hi12(memset) +entry: + call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 1000, i1 false) + ret void +} + +declare i32 @callee_tail(i32 %i) + +define i32 @caller_call_tail(i32 %i) nounwind { +; MEDIUM-LABEL: caller_call_tail: +; MEDIUM: pcaddu18i $t8, %call36(callee_tail) +; MEDIUM-NEXT: jr $t8 +; +; LARGE-LABEL: caller_call_tail: +; LARGE: pcalau12i [[REG1:\$[a-z0-9]+]], %got_pc_hi20(callee_tail) +; LARGE-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %got_pc_lo12(callee_tail) +; LARGE-NEXT: lu32i.d [[REG2]], %got64_pc_lo20(callee_tail) +; LARGE-NEXT: lu52i.d [[REG2]], [[REG2]], %got64_pc_hi12(callee_tail) +entry: + call i32 @callee_tail(i32 %i) + %r = tail call i32 @callee_tail(i32 %i) + ret i32 %r +} + +@ie = external thread_local(initialexec) global i32 + +define void @test_la_tls_ie(i32 signext %n) { +; LARGE-LABEL: test_la_tls_ie: +; LARGE: pcalau12i [[REG1:\$[a-z0-9]+]], %ie_pc_hi20(ie) +; LARGE-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %ie_pc_lo12(ie) +; LARGE-NEXT: lu32i.d [[REG2]], %ie64_pc_lo20(ie) +; LARGE-NEXT: lu52i.d [[REG2]], [[REG2]], %ie64_pc_hi12(ie) +entry: + br label %loop + +loop: + %i = phi i32 [ %inc, %loop ], [ 0, %entry ] + %0 = load volatile i32, ptr @ie, align 4 + %inc = add nuw nsw i32 %i, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %loop, label %ret + +ret: + ret void +} + +@ld = 
external thread_local(localdynamic) global i32 + +define void @test_la_tls_ld(i32 signext %n) { +; LARGE-LABEL: test_la_tls_ld: +; LARGE: pcalau12i [[REG1:\$[a-z0-9]+]], %ld_pc_hi20(ld) +; LARGE-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %got_pc_lo12(ld) +; LARGE-NEXT: lu32i.d [[REG2]], %got64_pc_lo20(ld) +; LARGE-NEXT: lu52i.d [[REG2]], [[REG2]], %got64_pc_hi12(ld) +entry: + br label %loop + +loop: + %i = phi i32 [ %inc, %loop ], [ 0, %entry ] + %0 = load volatile i32, ptr @ld, align 4 + %inc = add nuw nsw i32 %i, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %loop, label %ret + +ret: + ret void +} + +@gd = external thread_local global i32 + +define void @test_la_tls_gd(i32 signext %n) nounwind { +; LARGE-LABEL: test_la_tls_gd: +; LARGE: pcalau12i [[REG1:\$[a-z0-9]+]], %gd_pc_hi20(gd) +; LARGE-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %got_pc_lo12(gd) +; LARGE-NEXT: lu32i.d [[REG2]], %got64_pc_lo20(gd) +; LARGE-NEXT: lu52i.d [[REG2]], [[REG2]], %got64_pc_hi12(gd) +entry: + br label %loop + +loop: + %i = phi i32 [ %inc, %loop ], [ 0, %entry ] + %0 = load volatile i32, ptr @gd, align 4 + %inc = add nuw nsw i32 %i, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %loop, label %ret + +ret: + ret void +} + +@unspecified = external thread_local global i32 + +define ptr @test_la_tls_desc() nounwind { +; LARGEDESC-LABEL: test_la_tls_desc: +; LARGEDESC: pcalau12i [[REG1:\$[a-z0-9]+]], %desc_pc_hi20(unspecified) +; LARGEDESC-NEXT: addi.d [[REG2:\$[a-z0-9]+]], $zero, %desc_pc_lo12(unspecified) +; LARGEDESC-NEXT: lu32i.d [[REG2]], %desc64_pc_lo20(unspecified) +; LARGEDESC-NEXT: lu52i.d [[REG2]], [[REG2]], %desc64_pc_hi12(unspecified) +entry: + ret ptr @unspecified +} diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll index 3f38bbed881a3..50f2d21a9cc84 100644 --- a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -66,14 
+66,13 @@ define double @fdiv_d(double %x, double %y) { ; ; LA64D-FRECIPE-LABEL: fdiv_d: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) -; LA64D-FRECIPE-NEXT: frecipe.d $fa3, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 -; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 -; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 -; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: frecipe.d $fa2, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -784 +; LA64D-FRECIPE-NEXT: fmadd.d $fa3, $fa1, $fa2, $fa3 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa3, $fa2, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 ; LA64D-FRECIPE-NEXT: ret %div = fdiv fast double %x, %y ret double %div diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll index ac5cb3c7e7211..cf15fd8bdb437 100644 --- a/llvm/test/CodeGen/LoongArch/frame.ll +++ b/llvm/test/CodeGen/LoongArch/frame.ll @@ -12,8 +12,8 @@ define i32 @test() nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -32 ; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: st.w $zero, $sp, 16 -; CHECK-NEXT: st.d $zero, $sp, 8 -; CHECK-NEXT: st.d $zero, $sp, 0 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vst $vr0, $sp, 0 ; CHECK-NEXT: addi.d $a0, $sp, 4 ; CHECK-NEXT: bl %plt(test1) ; CHECK-NEXT: move $a0, $zero diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll index 388ae6321f664..5f14352fccd60 100644 --- a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -35,16 +35,14 @@ define float @frsqrt_f32(float %a) nounwind { ; ; LA64D-FRECIPE-LABEL: frsqrt_f32: ; LA64D-FRECIPE: # %bb.0: 
-; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA64D-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA64D-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr2, -1144 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: vldi $vr2, -1056 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 ; LA64D-FRECIPE-NEXT: ret %1 = call fast float @llvm.sqrt.f32(float %a) @@ -88,20 +86,18 @@ define double @frsqrt_f64(double %a) nounwind { ; ; LA64D-FRECIPE-LABEL: frsqrt_f64: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA64D-FRECIPE-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -888 +; LA64D-FRECIPE-NEXT: 
fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr4, -800 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 ; LA64D-FRECIPE-NEXT: ret %1 = call fast double @llvm.sqrt.f64(double %a) %2 = fdiv fast double 1.0, %1 @@ -209,26 +205,24 @@ define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2 ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) -; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) -; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) -; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) -; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5 -; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 -; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -888 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr4, -800 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; 
LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 ; LA64D-FRECIPE-NEXT: ret %sqrt = tail call fast double @llvm.sqrt.f64(double %x) %rsqrt = fdiv fast double 1.0, %sqrt @@ -342,29 +336,27 @@ define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, p ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3) -; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0 -; 
LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -888 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr4, -800 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0 ; LA64D-FRECIPE-NEXT: ret %sqrt = tail call fast double @llvm.sqrt.f64(double %x) %sqrt_fast = fdiv fast double %x, %sqrt @@ -512,30 +504,28 @@ define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2 ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) -; LA64D-FRECIPE-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) -; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 -; 
LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2) -; LA64D-FRECIPE-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2) -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3) -; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3) -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5 -; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 -; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 -; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -888 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr4, -800 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0 ; LA64D-FRECIPE-NEXT: ret %sqrt = tail call fast double @llvm.sqrt.f64(double %x) %rsqrt = fdiv fast double 1.0, %sqrt @@ -595,22 +585,20 @@ define float @sqrt_simplify_before_recip_3_uses_f32(float %x, ptr %p1, ptr %p2) ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, 
$fa0 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) -; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) -; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) -; LA64D-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) -; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 -; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 -; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -1144 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA64D-FRECIPE-NEXT: vldi $vr4, -1056 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 ; LA64D-FRECIPE-NEXT: ret ; %sqrt = tail call fast float @llvm.sqrt.f32(float %x) @@ -681,26 +669,24 @@ define float @sqrt_simplify_before_recip_4_uses_f32(float %x, ptr %p1, ptr %p2, ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) -; LA64D-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) -; LA64D-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 -; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 -; LA64D-FRECIPE-NEXT: fmadd.s $fa1, 
$fa3, $fa2, $fa1 -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) -; LA64D-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) -; LA64D-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) -; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) -; LA64D-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 -; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 -; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 -; LA64D-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -1144 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr3, -1056 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa3, $a2, 0 ; LA64D-FRECIPE-NEXT: ret ; %sqrt = tail call fast float @llvm.sqrt.f32(float %x) @@ -766,25 +752,23 @@ define float @sqrt_simplify_before_recip_3_uses_order_f32(float %x, ptr %p1, ptr ; ; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: ; LA64D-FRECIPE: # %bb.0: -; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) -; LA64D-FRECIPE-NEXT: 
fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) -; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) -; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) -; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) -; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 -; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 -; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 -; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 -; LA64D-FRECIPE-NEXT: fst.s $fa2, $a0, 0 -; LA64D-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: vldi $vr3, -1144 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: vldi $vr3, -1056 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a1, 0 ; LA64D-FRECIPE-NEXT: ret ; %sqrt = tail call fast float @llvm.sqrt.f32(float %x) diff --git a/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll b/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll index 6cf9d7d75b996..3d6e22b5eeb10 100644 --- a/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll +++ 
b/llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll @@ -5,22 +5,9 @@ define void @getSetCCResultType(ptr %p) { ; CHECK-LABEL: getSetCCResultType: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.w $a1, $a0, 0 -; CHECK-NEXT: ld.w $a2, $a0, 12 -; CHECK-NEXT: ld.w $a3, $a0, 4 -; CHECK-NEXT: ld.w $a4, $a0, 8 -; CHECK-NEXT: sltui $a1, $a1, 1 -; CHECK-NEXT: sub.d $a1, $zero, $a1 -; CHECK-NEXT: sltui $a3, $a3, 1 -; CHECK-NEXT: sub.d $a3, $zero, $a3 -; CHECK-NEXT: sltui $a4, $a4, 1 -; CHECK-NEXT: sub.d $a4, $zero, $a4 -; CHECK-NEXT: sltui $a2, $a2, 1 -; CHECK-NEXT: sub.d $a2, $zero, $a2 -; CHECK-NEXT: st.w $a2, $a0, 12 -; CHECK-NEXT: st.w $a4, $a0, 8 -; CHECK-NEXT: st.w $a3, $a0, 4 -; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vseqi.w $vr0, $vr0, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %0 = load <4 x i32>, ptr %p, align 16 diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll index 570fd438be97b..83f796f73934c 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-error.ll @@ -1,4 +1,4 @@ -; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s +; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LA32 ; RUN: not llc --mtriple=loongarch64 < %s 2>&1 | FileCheck %s define void @constraint_l() { @@ -32,9 +32,9 @@ define void @constraint_K() { } define void @constraint_f() nounwind { -; CHECK: error: couldn't allocate input reg for constraint 'f' +; LA32: error: couldn't allocate input reg for constraint 'f' tail call void asm "fadd.s $$fa0, $$fa0, $0", "f"(float 0.0) -; CHECK: error: couldn't allocate input reg for constraint 'f' +; LA32: error: couldn't allocate input reg for constraint 'f' tail call void asm "fadd.s $$fa0, $$fa0, $0", "f"(double 0.0) ret void } diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-error.ll 
b/llvm/test/CodeGen/LoongArch/intrinsic-error.ll index a839ab149c333..176e3f60c5625 100644 --- a/llvm/test/CodeGen/LoongArch/intrinsic-error.ll +++ b/llvm/test/CodeGen/LoongArch/intrinsic-error.ll @@ -1,4 +1,4 @@ -; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s +; RUN: not llc --mtriple=loongarch32 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,LA32 ; RUN: not llc --mtriple=loongarch64 < %s 2>&1 | FileCheck %s declare void @llvm.loongarch.dbar(i32) @@ -54,7 +54,7 @@ entry: } define void @movgr2fcsr(i32 %a) nounwind { -; CHECK: llvm.loongarch.movgr2fcsr: requires basic 'f' target feature. +; LA32: llvm.loongarch.movgr2fcsr: requires basic 'f' target feature. entry: call void @llvm.loongarch.movgr2fcsr(i32 1, i32 %a) ret void @@ -75,7 +75,7 @@ entry: } define i32 @movfcsr2gr() nounwind { -; CHECK: llvm.loongarch.movfcsr2gr: requires basic 'f' target feature. +; LA32: llvm.loongarch.movfcsr2gr: requires basic 'f' target feature. entry: %res = call i32 @llvm.loongarch.movfcsr2gr(i32 1) ret i32 %res diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll index 622001db32955..402ddb9ad941b 100644 --- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll +++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll @@ -12,18 +12,12 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der ; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4 ; CHECK-NEXT: addi.d $a2, $sp, 0 ; CHECK-NEXT: add.d $a3, $a2, $a1 -; CHECK-NEXT: ldx.d $a1, $a1, $a2 -; CHECK-NEXT: ld.d $a2, $a3, 40 -; CHECK-NEXT: st.d $a1, $a0, 0 -; CHECK-NEXT: st.d $a2, $a0, 40 -; CHECK-NEXT: ld.d $a1, $a3, 32 -; CHECK-NEXT: ld.d $a2, $a3, 24 -; CHECK-NEXT: ld.d $a4, $a3, 16 -; CHECK-NEXT: ld.d $a3, $a3, 8 -; CHECK-NEXT: st.d $a1, $a0, 32 -; CHECK-NEXT: st.d $a2, $a0, 24 -; CHECK-NEXT: st.d $a4, $a0, 16 -; CHECK-NEXT: st.d $a3, $a0, 8 +; CHECK-NEXT: vldx $vr0, $a1, $a2 +; CHECK-NEXT: vld $vr1, $a3, 32 +; CHECK-NEXT: vld $vr2, $a3, 16 +; 
CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: vst $vr1, $a0, 32 +; CHECK-NEXT: vst $vr2, $a0, 16 ; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %1 = alloca [2 x %Box], align 16 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index 7e320d9245f1c..6ea658acdd717 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -40,9 +40,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { ; LA64D-LABEL: float_fadd_acquire: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB0_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -111,8 +109,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D-LABEL: float_fsub_acquire: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0) +; LA64D-NEXT: vldi $vr1, -1040 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB1_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -183,9 +180,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { ; LA64D-LABEL: float_fmin_acquire: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB2_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -257,9 +252,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { ; LA64D-LABEL: float_fmax_acquire: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; 
LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB3_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -331,35 +324,31 @@ define double @double_fadd_acquire(ptr %p) nounwind { ; ; LA64D-LABEL: double_fadd_acquire: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB4_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -912 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 2 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB4_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = 
atomicrmw fadd ptr %p, double 1.0 acquire, align 4 ret double %v @@ -404,34 +393,31 @@ define double @double_fsub_acquire(ptr %p) nounwind { ; ; LA64D-LABEL: double_fsub_acquire: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB5_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -784 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 2 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB5_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, double 1.0 acquire, align 4 ret double %v @@ -476,36 +462,32 @@ define 
double @double_fmin_acquire(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmin_acquire: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB6_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmin.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmin.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 2 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB6_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, double 1.0 acquire, align 4 ret double %v @@ -550,36 +532,32 @@ define double @double_fmax_acquire(ptr %p) nounwind { ; ; 
LA64D-LABEL: double_fmax_acquire: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB7_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmax.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 2 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB7_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, double 1.0 acquire, align 4 ret double %v @@ -623,9 +601,7 @@ define float @float_fadd_release(ptr %p) nounwind { ; LA64D-LABEL: float_fadd_release: ; LA64D: # %bb.0: ; 
LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB8_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -694,8 +670,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D-LABEL: float_fsub_release: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) -; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI9_0) +; LA64D-NEXT: vldi $vr1, -1040 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB9_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -766,9 +741,7 @@ define float @float_fmin_release(ptr %p) nounwind { ; LA64D-LABEL: float_fmin_release: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB10_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -840,9 +813,7 @@ define float @float_fmax_release(ptr %p) nounwind { ; LA64D-LABEL: float_fmax_release: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB11_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -914,35 +885,31 @@ define double @double_fadd_release(ptr %p) nounwind { ; ; LA64D-LABEL: double_fadd_release: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte 
Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB12_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -912 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 3 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB12_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, double 1.0 release, align 4 ret double %v @@ -987,34 +954,31 @@ define double @double_fsub_release(ptr %p) nounwind { ; ; LA64D-LABEL: double_fsub_release: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI13_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB13_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -784 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 3 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB13_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, double 1.0 release, align 4 ret double %v @@ -1059,36 +1023,32 @@ define double @double_fmin_release(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmin_release: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 
; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB14_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmin.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmin.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 3 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB14_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, double 1.0 release, align 4 ret double %v @@ -1133,36 +1093,32 @@ define double @double_fmax_release(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmax_release: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: 
.LBB15_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmax.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 3 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB15_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, double 1.0 release, align 4 ret double %v @@ -1206,9 +1162,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { ; LA64D-LABEL: float_fadd_acq_rel: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB16_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1277,8 +1231,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D-LABEL: float_fsub_acq_rel: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI17_0) +; LA64D-NEXT: vldi $vr1, -1040 ; LA64D-NEXT: .p2align 4, , 16 ; 
LA64D-NEXT: .LBB17_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1349,9 +1302,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { ; LA64D-LABEL: float_fmin_acq_rel: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB18_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1423,9 +1374,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { ; LA64D-LABEL: float_fmax_acq_rel: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB19_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1497,35 +1446,31 @@ define double @double_fadd_acq_rel(ptr %p) nounwind { ; ; LA64D-LABEL: double_fadd_acq_rel: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB20_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -912 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d 
$a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 4 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB20_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, double 1.0 acq_rel, align 4 ret double %v @@ -1570,34 +1515,31 @@ define double @double_fsub_acq_rel(ptr %p) nounwind { ; ; LA64D-LABEL: double_fsub_acq_rel: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI21_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB21_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -784 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; 
LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 4 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB21_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, double 1.0 acq_rel, align 4 ret double %v @@ -1642,36 +1584,32 @@ define double @double_fmin_acq_rel(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmin_acq_rel: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB22_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmin.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmin.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: 
ori $a4, $zero, 4 ; LA64D-NEXT: ori $a5, $zero, 2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB22_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, double 1.0 acq_rel, align 4 ret double %v @@ -1716,36 +1654,32 @@ define double @double_fmax_acq_rel(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmax_acq_rel: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB23_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmax.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 4 ; LA64D-NEXT: ori $a5, $zero, 
2 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB23_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, double 1.0 acq_rel, align 4 ret double %v @@ -1789,9 +1723,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { ; LA64D-LABEL: float_fadd_seq_cst: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB24_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1860,8 +1792,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D-LABEL: float_fsub_seq_cst: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) -; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI25_0) +; LA64D-NEXT: vldi $vr1, -1040 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB25_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1932,9 +1863,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { ; LA64D-LABEL: float_fmin_seq_cst: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB26_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2006,9 +1935,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { ; LA64D-LABEL: 
float_fmax_seq_cst: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB27_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2080,35 +2007,31 @@ define double @double_fadd_seq_cst(ptr %p) nounwind { ; ; LA64D-LABEL: double_fadd_seq_cst: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB28_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -912 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 5 ; LA64D-NEXT: ori $a5, $zero, 5 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB28_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, 
$sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, double 1.0 seq_cst, align 4 ret double %v @@ -2153,34 +2076,31 @@ define double @double_fsub_seq_cst(ptr %p) nounwind { ; ; LA64D-LABEL: double_fsub_seq_cst: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI29_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB29_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -784 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 5 ; LA64D-NEXT: ori $a5, $zero, 5 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB29_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, 
$sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, double 1.0 seq_cst, align 4 ret double %v @@ -2225,36 +2145,32 @@ define double @double_fmin_seq_cst(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmin_seq_cst: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB30_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmin.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmin.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 5 ; LA64D-NEXT: ori $a5, $zero, 5 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB30_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: 
addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, double 1.0 seq_cst, align 4 ret double %v @@ -2299,36 +2215,32 @@ define double @double_fmax_seq_cst(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmax_seq_cst: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB31_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmax.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: ori $a4, $zero, 5 ; LA64D-NEXT: ori $a5, $zero, 5 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB31_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = 
atomicrmw fmax ptr %p, double 1.0 seq_cst, align 4 ret double %v @@ -2372,9 +2284,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { ; LA64D-LABEL: float_fadd_monotonic: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB32_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2443,8 +2353,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D-LABEL: float_fsub_monotonic: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) -; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI33_0) +; LA64D-NEXT: vldi $vr1, -1040 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB33_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2515,9 +2424,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { ; LA64D-LABEL: float_fmin_monotonic: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB34_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2589,9 +2496,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { ; LA64D-LABEL: float_fmax_monotonic: ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 -; LA64D-NEXT: addi.w $a1, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-NEXT: ffint.s.w $fa1, $fa1 +; LA64D-NEXT: vldi $vr1, -1168 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB35_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2663,35 +2568,31 @@ define double @double_fadd_monotonic(ptr %p) nounwind { ; ; LA64D-LABEL: double_fadd_monotonic: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d 
$fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB36_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -912 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a4, $zero ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB36_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fadd ptr %p, double 1.0 monotonic, align 4 ret double %v @@ -2736,34 +2637,31 @@ define double @double_fsub_monotonic(ptr %p) nounwind { ; ; LA64D-LABEL: double_fsub_monotonic: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte 
Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI37_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB37_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64D-NEXT: fadd.d $fa1, $fa0, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr1, -784 +; LA64D-NEXT: fadd.d $fa1, $fa0, $fa1 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a4, $zero ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB37_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fsub ptr %p, double 1.0 monotonic, align 4 ret double %v @@ -2808,36 +2706,32 @@ define double @double_fmin_monotonic(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmin_monotonic: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 
8-byte Folded Spill +; LA64D-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB38_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmin.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmin.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a4, $zero ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB38_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmin ptr %p, double 1.0 monotonic, align 4 ret double %v @@ -2882,36 +2776,32 @@ define double @double_fmax_monotonic(ptr %p) nounwind { ; ; LA64D-LABEL: double_fmax_monotonic: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $sp, $sp, -48 -; LA64D-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; LA64D-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: addi.d $sp, $sp, -32 +; LA64D-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64D-NEXT: st.d $fp, 
$sp, 16 # 8-byte Folded Spill ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa1, $a0 -; LA64D-NEXT: ffint.d.l $fs0, $fa1 ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB39_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ; LA64D-NEXT: fmax.d $fa1, $fa0, $fa0 -; LA64D-NEXT: fmax.d $fa1, $fa1, $fs0 -; LA64D-NEXT: fst.d $fa0, $sp, 16 -; LA64D-NEXT: fst.d $fa1, $sp, 8 +; LA64D-NEXT: vldi $vr2, -912 +; LA64D-NEXT: fmax.d $fa1, $fa1, $fa2 +; LA64D-NEXT: fst.d $fa0, $sp, 8 +; LA64D-NEXT: fst.d $fa1, $sp, 0 ; LA64D-NEXT: ori $a0, $zero, 8 -; LA64D-NEXT: addi.d $a2, $sp, 16 -; LA64D-NEXT: addi.d $a3, $sp, 8 +; LA64D-NEXT: addi.d $a2, $sp, 8 +; LA64D-NEXT: addi.d $a3, $sp, 0 ; LA64D-NEXT: move $a1, $fp ; LA64D-NEXT: move $a4, $zero ; LA64D-NEXT: move $a5, $zero ; LA64D-NEXT: bl %plt(__atomic_compare_exchange) -; LA64D-NEXT: fld.d $fa0, $sp, 16 +; LA64D-NEXT: fld.d $fa0, $sp, 8 ; LA64D-NEXT: beqz $a0, .LBB39_1 ; LA64D-NEXT: # %bb.2: # %atomicrmw.end -; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; LA64D-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; LA64D-NEXT: addi.d $sp, $sp, 48 +; LA64D-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64D-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64D-NEXT: addi.d $sp, $sp, 32 ; LA64D-NEXT: ret %v = atomicrmw fmax ptr %p, double 1.0 monotonic, align 4 ret double %v diff --git a/llvm/test/CodeGen/LoongArch/lasx/bswap.ll b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll new file mode 100644 index 0000000000000..1b0132d25ed59 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/bswap.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @bswap_v16i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v16i16: +; CHECK: 
# %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 177 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i16>, ptr %src + %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v) + store <16 x i16> %res, ptr %dst + ret void +} + +define void @bswap_v8i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i32>, ptr %src + %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v) + store <8 x i32> %res, ptr %dst + ret void +} + +define void @bswap_v4i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27 +; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 177 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i64>, ptr %src + %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v) + store <4 x i64> %res, ptr %dst + ret void +} + +declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>) +declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) +declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/bswap.ll b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll new file mode 100644 index 0000000000000..8172e21eae34d --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/bswap.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @bswap_v8i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 177 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i16>, ptr %src + %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v) + store <8 x i16> %res, ptr %dst + ret void +} + +define void @bswap_v4i32(ptr 
%src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i32>, ptr %src + %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v) + store <4 x i32> %res, ptr %dst + ret void +} + +define void @bswap_v2i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: bswap_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 177 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <2 x i64>, ptr %src + %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v) + store <2 x i64> %res, ptr %dst + ret void +} + +declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll index 7500b5ae09359..96853105049b4 100644 --- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll +++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll @@ -142,90 +142,44 @@ define signext i32 @test4(ptr %p, i32 signext %b) nounwind { define void @test5(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: addi.d $sp, $sp, -48 -; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; CHECK-NEXT: sra.w $a1, $a0, $a1 -; CHECK-NEXT: lu12i.w $a0, 349525 -; CHECK-NEXT: ori $fp, $a0, 1365 -; CHECK-NEXT: lu12i.w $a0, 209715 -; CHECK-NEXT: ori $s0, $a0, 819 -; CHECK-NEXT: lu12i.w $a0, 61680 -; CHECK-NEXT: ori $s1, $a0, 3855 -; CHECK-NEXT: lu12i.w $a0, 
4112 -; CHECK-NEXT: ori $s2, $a0, 257 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB4_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: addi.w $a0, $a1, 0 ; CHECK-NEXT: bl %plt(bar) -; CHECK-NEXT: srli.d $a1, $a0, 1 -; CHECK-NEXT: and $a1, $a1, $fp -; CHECK-NEXT: sub.d $a1, $a0, $a1 -; CHECK-NEXT: and $a2, $a1, $s0 -; CHECK-NEXT: srli.d $a1, $a1, 2 -; CHECK-NEXT: and $a1, $a1, $s0 -; CHECK-NEXT: add.d $a1, $a2, $a1 -; CHECK-NEXT: srli.d $a2, $a1, 4 -; CHECK-NEXT: add.d $a1, $a1, $a2 -; CHECK-NEXT: and $a1, $a1, $s1 -; CHECK-NEXT: mul.d $a1, $a1, $s2 -; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 24 +; CHECK-NEXT: bstrpick.d $a1, $a0, 31, 0 +; CHECK-NEXT: vldi $vr0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; CHECK-NEXT: vpcnt.d $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.d $a1, $vr0, 0 ; CHECK-NEXT: bnez $a0, .LBB4_1 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 48 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test5: ; NORMV: # %bb.0: # %bb -; NORMV-NEXT: addi.d $sp, $sp, -48 -; NORMV-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; NORMV-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; NORMV-NEXT: addi.d $sp, $sp, -16 +; NORMV-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; NORMV-NEXT: sra.w $a1, $a0, $a1 -; NORMV-NEXT: lu12i.w $a0, 349525 -; NORMV-NEXT: ori $fp, $a0, 1365 -; NORMV-NEXT: lu12i.w $a0, 209715 -; NORMV-NEXT: ori $s0, $a0, 819 -; NORMV-NEXT: lu12i.w $a0, 61680 -; NORMV-NEXT: ori $s1, $a0, 3855 -; NORMV-NEXT: 
lu12i.w $a0, 4112 -; NORMV-NEXT: ori $s2, $a0, 257 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB4_1: # %bb2 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1 ; NORMV-NEXT: addi.w $a0, $a1, 0 ; NORMV-NEXT: bl %plt(bar) -; NORMV-NEXT: srli.d $a1, $a0, 1 -; NORMV-NEXT: and $a1, $a1, $fp -; NORMV-NEXT: sub.d $a1, $a0, $a1 -; NORMV-NEXT: and $a2, $a1, $s0 -; NORMV-NEXT: srli.d $a1, $a1, 2 -; NORMV-NEXT: and $a1, $a1, $s0 -; NORMV-NEXT: add.d $a1, $a2, $a1 -; NORMV-NEXT: srli.d $a2, $a1, 4 -; NORMV-NEXT: add.d $a1, $a1, $a2 -; NORMV-NEXT: and $a1, $a1, $s1 -; NORMV-NEXT: mul.d $a1, $a1, $s2 -; NORMV-NEXT: bstrpick.d $a1, $a1, 31, 24 +; NORMV-NEXT: bstrpick.d $a1, $a0, 31, 0 +; NORMV-NEXT: vldi $vr0, 0 +; NORMV-NEXT: vinsgr2vr.d $vr0, $a1, 0 +; NORMV-NEXT: vpcnt.d $vr0, $vr0 +; NORMV-NEXT: vpickve2gr.d $a1, $vr0, 0 ; NORMV-NEXT: bnez $a0, .LBB4_1 ; NORMV-NEXT: # %bb.2: # %bb7 -; NORMV-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; NORMV-NEXT: addi.d $sp, $sp, 48 +; NORMV-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; NORMV-NEXT: addi.d $sp, $sp, 16 ; NORMV-NEXT: ret bb: %i = ashr i32 %arg, %arg1 @@ -247,54 +201,45 @@ declare i32 @llvm.ctpop.i32(i32) define void @test6(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-LABEL: test6: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: addi.d $sp, $sp, -32 -; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: sra.w $fp, $a0, $a1 +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; CHECK-NEXT: sra.w $a0, $a0, $a1 +; CHECK-NEXT: movgr2fr.w $fs0, $zero ; CHECK-NEXT: .p2align 4, , 16 ; 
CHECK-NEXT: .LBB5_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: addi.w $a0, $fp, 0 ; CHECK-NEXT: bl %plt(baz) -; CHECK-NEXT: move $s0, $a0 -; CHECK-NEXT: bl %plt(__fixsfsi) -; CHECK-NEXT: move $fp, $a0 -; CHECK-NEXT: move $a0, $s0 -; CHECK-NEXT: move $a1, $zero -; CHECK-NEXT: bl %plt(__nesf2) -; CHECK-NEXT: bnez $a0, .LBB5_1 +; CHECK-NEXT: ftintrz.w.s $fa1, $fa0 +; CHECK-NEXT: fcmp.cune.s $fcc0, $fa0, $fs0 +; CHECK-NEXT: movfr2gr.s $a0, $fa1 +; CHECK-NEXT: bcnez $fcc0, .LBB5_1 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 32 +; CHECK-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test6: ; NORMV: # %bb.0: # %bb -; NORMV-NEXT: addi.d $sp, $sp, -32 -; NORMV-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; NORMV-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; NORMV-NEXT: sra.w $fp, $a0, $a1 +; NORMV-NEXT: addi.d $sp, $sp, -16 +; NORMV-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; NORMV-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; NORMV-NEXT: sra.w $a0, $a0, $a1 +; NORMV-NEXT: movgr2fr.w $fs0, $zero ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB5_1: # %bb2 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1 -; NORMV-NEXT: addi.w $a0, $fp, 0 +; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: bl %plt(baz) -; NORMV-NEXT: move $s0, $a0 -; NORMV-NEXT: bl %plt(__fixsfsi) -; NORMV-NEXT: move $fp, $a0 -; NORMV-NEXT: move $a0, $s0 -; NORMV-NEXT: move $a1, $zero -; NORMV-NEXT: bl %plt(__nesf2) -; NORMV-NEXT: bnez $a0, .LBB5_1 +; NORMV-NEXT: ftintrz.w.s $fa1, $fa0 +; NORMV-NEXT: fcmp.cune.s $fcc0, $fa0, $fs0 +; NORMV-NEXT: movfr2gr.s $a0, $fa1 +; NORMV-NEXT: bcnez $fcc0, .LBB5_1 ; 
NORMV-NEXT: # %bb.2: # %bb7 -; NORMV-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; NORMV-NEXT: addi.d $sp, $sp, 32 +; NORMV-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; NORMV-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; NORMV-NEXT: addi.d $sp, $sp, 16 ; NORMV-NEXT: ret bb: %i = ashr i32 %arg, %arg1 @@ -315,97 +260,42 @@ declare float @baz(i32 signext %i3) define void @test7(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK-LABEL: test7: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: addi.d $sp, $sp, -48 -; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill -; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; CHECK-NEXT: sra.w $a0, $a0, $a1 -; CHECK-NEXT: lu12i.w $a1, 349525 -; CHECK-NEXT: ori $fp, $a1, 1365 -; CHECK-NEXT: bstrins.d $fp, $fp, 62, 32 -; CHECK-NEXT: lu12i.w $a1, 209715 -; CHECK-NEXT: ori $s0, $a1, 819 -; CHECK-NEXT: bstrins.d $s0, $s0, 61, 32 -; CHECK-NEXT: lu12i.w $a1, 61680 -; CHECK-NEXT: ori $s1, $a1, 3855 -; CHECK-NEXT: bstrins.d $s1, $s1, 59, 32 -; CHECK-NEXT: lu12i.w $a1, 4112 -; CHECK-NEXT: ori $s2, $a1, 257 -; CHECK-NEXT: bstrins.d $s2, $s2, 56, 32 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB6_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: addi.w $a0, $a0, 0 ; CHECK-NEXT: bl %plt(foo) -; CHECK-NEXT: srli.d $a1, $a0, 1 -; CHECK-NEXT: and $a1, $a1, $fp -; CHECK-NEXT: sub.d $a0, $a0, $a1 -; CHECK-NEXT: and $a1, $a0, $s0 -; CHECK-NEXT: srli.d $a0, $a0, 2 -; CHECK-NEXT: and $a0, $a0, $s0 -; CHECK-NEXT: add.d $a0, $a1, $a0 -; CHECK-NEXT: srli.d $a1, $a0, 4 -; CHECK-NEXT: add.d $a0, $a0, $a1 -; CHECK-NEXT: and $a0, $a0, $s1 -; CHECK-NEXT: 
mul.d $a0, $a0, $s2 -; CHECK-NEXT: srli.d $a0, $a0, 56 +; CHECK-NEXT: vldi $vr0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vpcnt.d $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 ; CHECK-NEXT: bnez $a0, .LBB6_1 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 48 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test7: ; NORMV: # %bb.0: # %bb -; NORMV-NEXT: addi.d $sp, $sp, -48 -; NORMV-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill -; NORMV-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill -; NORMV-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; NORMV-NEXT: addi.d $sp, $sp, -16 +; NORMV-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; NORMV-NEXT: sra.w $a0, $a0, $a1 -; NORMV-NEXT: lu12i.w $a1, 349525 -; NORMV-NEXT: ori $fp, $a1, 1365 -; NORMV-NEXT: bstrins.d $fp, $fp, 62, 32 -; NORMV-NEXT: lu12i.w $a1, 209715 -; NORMV-NEXT: ori $s0, $a1, 819 -; NORMV-NEXT: bstrins.d $s0, $s0, 61, 32 -; NORMV-NEXT: lu12i.w $a1, 61680 -; NORMV-NEXT: ori $s1, $a1, 3855 -; NORMV-NEXT: bstrins.d $s1, $s1, 59, 32 -; NORMV-NEXT: lu12i.w $a1, 4112 -; NORMV-NEXT: ori $s2, $a1, 257 -; NORMV-NEXT: bstrins.d $s2, $s2, 56, 32 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB6_1: # %bb2 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1 ; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: bl %plt(foo) -; NORMV-NEXT: srli.d $a1, $a0, 1 -; NORMV-NEXT: and $a1, $a1, $fp -; NORMV-NEXT: sub.d $a0, $a0, $a1 -; NORMV-NEXT: and $a1, $a0, $s0 -; NORMV-NEXT: srli.d $a0, $a0, 2 -; NORMV-NEXT: and $a0, $a0, $s0 -; NORMV-NEXT: add.d 
$a0, $a1, $a0 -; NORMV-NEXT: srli.d $a1, $a0, 4 -; NORMV-NEXT: add.d $a0, $a0, $a1 -; NORMV-NEXT: and $a0, $a0, $s1 -; NORMV-NEXT: mul.d $a0, $a0, $s2 -; NORMV-NEXT: srli.d $a0, $a0, 56 +; NORMV-NEXT: vldi $vr0, 0 +; NORMV-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; NORMV-NEXT: vpcnt.d $vr0, $vr0 +; NORMV-NEXT: vpickve2gr.d $a0, $vr0, 0 ; NORMV-NEXT: bnez $a0, .LBB6_1 ; NORMV-NEXT: # %bb.2: # %bb7 -; NORMV-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload -; NORMV-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload -; NORMV-NEXT: addi.d $sp, $sp, 48 +; NORMV-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; NORMV-NEXT: addi.d $sp, $sp, 16 ; NORMV-NEXT: ret bb: %i = ashr i32 %arg, %arg1 @@ -544,19 +434,18 @@ define void @test10(i32 signext %arg, i32 signext %arg1) nounwind { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill -; CHECK-NEXT: sra.w $fp, $a0, $a1 +; CHECK-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; CHECK-NEXT: sra.w $a0, $a0, $a1 +; CHECK-NEXT: movgr2fr.w $fs0, $zero ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB9_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: addi.w $a0, $fp, 0 ; CHECK-NEXT: bl %plt(baz) -; CHECK-NEXT: move $fp, $a0 -; CHECK-NEXT: move $a1, $zero -; CHECK-NEXT: bl %plt(__nesf2) -; CHECK-NEXT: bnez $a0, .LBB9_1 +; CHECK-NEXT: fcmp.cune.s $fcc0, $fa0, $fs0 +; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: bcnez $fcc0, .LBB9_1 ; CHECK-NEXT: # %bb.2: # %bb7 -; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; CHECK-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; CHECK-NEXT: addi.d $sp, $sp, 16 ; CHECK-NEXT: ret @@ -565,19 +454,19 @@ define void @test10(i32 signext %arg, i32 signext 
%arg1) nounwind { ; NORMV: # %bb.0: # %bb ; NORMV-NEXT: addi.d $sp, $sp, -16 ; NORMV-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; NORMV-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill -; NORMV-NEXT: sra.w $fp, $a0, $a1 +; NORMV-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; NORMV-NEXT: sra.w $a0, $a0, $a1 +; NORMV-NEXT: movgr2fr.w $fs0, $zero ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB9_1: # %bb2 ; NORMV-NEXT: # =>This Inner Loop Header: Depth=1 -; NORMV-NEXT: addi.w $a0, $fp, 0 +; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: bl %plt(baz) -; NORMV-NEXT: move $fp, $a0 -; NORMV-NEXT: move $a1, $zero -; NORMV-NEXT: bl %plt(__nesf2) -; NORMV-NEXT: bnez $a0, .LBB9_1 +; NORMV-NEXT: fcmp.cune.s $fcc0, $fa0, $fs0 +; NORMV-NEXT: movfr2gr.s $a0, $fa0 +; NORMV-NEXT: bcnez $fcc0, .LBB9_1 ; NORMV-NEXT: # %bb.2: # %bb7 -; NORMV-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; NORMV-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; NORMV-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; NORMV-NEXT: addi.d $sp, $sp, 16 ; NORMV-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll b/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll index 4eb34bfa09acb..ae8c0a6a15ed6 100644 --- a/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll +++ b/llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch64 --mattr=-f < %s | FileCheck %s --check-prefix=LA64 define i32 @fptosi_i32_fp128(fp128 %X) nounwind { ; LA32-LABEL: fptosi_i32_fp128: diff --git a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering-r1.ll b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering-r1.ll index 4a77b2c00f54c..ee55ed337a28c 100644 --- a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering-r1.ll +++ 
b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering-r1.ll @@ -5,7 +5,7 @@ define void @test() gc "statepoint-example" { entry: %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" ()] -; CHECK: STATEPOINT 0, 0, 0, target-flags(loongarch-call-plt) @return_i1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_ilp32s_lp64s, implicit-def $r3, implicit-def dead early-clobber $r1 +; CHECK: STATEPOINT 0, 0, 0, target-flags(loongarch-call-plt) @return_i1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_ilp32d_lp64d, implicit-def $r3, implicit-def dead early-clobber $r1 ret void } diff --git a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll index 6956929e721d7..e5febe62ae8a8 100644 --- a/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/LoongArch/statepoint-call-lowering.ll @@ -200,10 +200,8 @@ define void @test_attributes(ptr byval(%struct2) %s) nounwind gc "statepoint-exa ; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: ld.d $a1, $a0, 16 ; CHECK-NEXT: st.d $a1, $sp, 16 -; CHECK-NEXT: ld.d $a1, $a0, 8 -; CHECK-NEXT: st.d $a1, $sp, 8 -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: st.d $a0, $sp, 0 +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vst $vr0, $sp, 0 ; CHECK-NEXT: ori $a0, $zero, 42 ; CHECK-NEXT: ori $a2, $zero, 17 ; CHECK-NEXT: addi.d $a3, $sp, 0 diff --git a/llvm/test/CodeGen/LoongArch/tail-calls.ll b/llvm/test/CodeGen/LoongArch/tail-calls.ll index 8298d76d8e3a6..7f315ee897b1c 100644 --- a/llvm/test/CodeGen/LoongArch/tail-calls.ll +++ b/llvm/test/CodeGen/LoongArch/tail-calls.ll @@ -103,8 +103,8 @@ define void @caller_indirect_args() nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -48 ; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill ; CHECK-NEXT: st.d $zero, $sp, 24 -; CHECK-NEXT: st.d $zero, $sp, 16 -; CHECK-NEXT: st.d $zero, 
$sp, 8 +; CHECK-NEXT: vrepli.b $vr0, 0 +; CHECK-NEXT: vst $vr0, $sp, 8 ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: addi.d $a0, $sp, 0 ; CHECK-NEXT: st.d $a1, $sp, 0 diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll index 36b78ea2ea02c..147b792361478 100644 --- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll +++ b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll @@ -66,14 +66,28 @@ define float @f(float %a) { ; ; LP64D-LABEL: f: ; LP64D: # %bb.0: -; LP64D-NEXT: addi.w $a0, $zero, 1 -; LP64D-NEXT: movgr2fr.w $fa1, $a0 -; LP64D-NEXT: ffint.s.w $fa1, $fa1 +; LP64D-NEXT: vldi $vr1, -1168 ; LP64D-NEXT: fadd.s $fa0, $fa0, $fa1 ; LP64D-NEXT: ret ; -; LP64S-LABEL: f: -; LP64S: bl %plt(__addsf3) +; LP64S-LP64F-NOF-LABEL: f: +; LP64S-LP64F-NOF: bl %plt(__addsf3) +; +; LP64S-LP64D-NOD-LABEL: f: +; LP64S-LP64D-NOD: # %bb.0: +; LP64S-LP64D-NOD-NEXT: movgr2fr.w $fa0, $a0 +; LP64S-LP64D-NOD-NEXT: addi.w $a0, $zero, 1 +; LP64S-LP64D-NOD-NEXT: movgr2fr.w $fa1, $a0 +; LP64S-LP64D-NOD-NEXT: ffint.s.w $fa1, $fa1 +; LP64S-LP64D-NOD-NEXT: fadd.s $fa0, $fa0, $fa1 +; LP64S-LP64D-NOD-NEXT: movfr2gr.s $a0, $fa0 +; LP64S-LP64D-NOD-NEXT: ret +; +; LP64D-LP64F-NOF-LABEL: f: +; LP64D-LP64F-NOF: bl %plt(__addsf3) +; +; LP64D-NONE-NOF-LABEL: f: +; LP64D-NONE-NOF: bl %plt(__addsf3) %1 = fadd float %a, 1.0 ret float %1 } @@ -90,9 +104,7 @@ define double @g(double %a) { ; ; LP64D-LABEL: g: ; LP64D: # %bb.0: -; LP64D-NEXT: addi.d $a0, $zero, 1 -; LP64D-NEXT: movgr2fr.d $fa1, $a0 -; LP64D-NEXT: ffint.d.l $fa1, $fa1 +; LP64D-NEXT: vldi $vr1, -912 ; LP64D-NEXT: fadd.d $fa0, $fa0, $fa1 ; LP64D-NEXT: ret ; diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple.ll index 0aca339038860..c8a33725267a2 100644 --- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple.ll +++ 
b/llvm/test/CodeGen/LoongArch/target-abi-from-triple.ll @@ -18,9 +18,7 @@ define float @f(float %a) { ; ; LP64D-LABEL: f: ; LP64D: # %bb.0: -; LP64D-NEXT: addi.w $a0, $zero, 1 -; LP64D-NEXT: movgr2fr.w $fa1, $a0 -; LP64D-NEXT: ffint.s.w $fa1, $fa1 +; LP64D-NEXT: vldi $vr1, -1168 ; LP64D-NEXT: fadd.s $fa0, $fa0, $fa1 ; LP64D-NEXT: ret %1 = fadd float %a, 1.0 @@ -39,9 +37,7 @@ define double @g(double %a) { ; ; LP64D-LABEL: g: ; LP64D: # %bb.0: -; LP64D-NEXT: addi.d $a0, $zero, 1 -; LP64D-NEXT: movgr2fr.d $fa1, $a0 -; LP64D-NEXT: ffint.d.l $fa1, $fa1 +; LP64D-NEXT: vldi $vr1, -912 ; LP64D-NEXT: fadd.d $fa0, $fa0, $fa1 ; LP64D-NEXT: ret %1 = fadd double %a, 1.0 diff --git a/llvm/test/CodeGen/LoongArch/vararg.ll b/llvm/test/CodeGen/LoongArch/vararg.ll index bbf3b8e333306..f488610868eb3 100644 --- a/llvm/test/CodeGen/LoongArch/vararg.ll +++ b/llvm/test/CodeGen/LoongArch/vararg.ll @@ -289,8 +289,8 @@ define void @va_aligned_stack_caller() nounwind { ; LA64-FPELIM-NEXT: lu52i.d $a0, $a0, -328 ; LA64-FPELIM-NEXT: st.d $a0, $sp, 16 ; LA64-FPELIM-NEXT: st.d $zero, $sp, 88 -; LA64-FPELIM-NEXT: st.d $zero, $sp, 80 -; LA64-FPELIM-NEXT: st.d $zero, $sp, 72 +; LA64-FPELIM-NEXT: vrepli.b $vr0, 0 +; LA64-FPELIM-NEXT: vst $vr0, $sp, 72 ; LA64-FPELIM-NEXT: ori $a5, $zero, 1000 ; LA64-FPELIM-NEXT: ori $a0, $zero, 1 ; LA64-FPELIM-NEXT: ori $a1, $zero, 11 @@ -330,8 +330,8 @@ define void @va_aligned_stack_caller() nounwind { ; LA64-WITHFP-NEXT: lu52i.d $a0, $a0, -328 ; LA64-WITHFP-NEXT: st.d $a0, $sp, 16 ; LA64-WITHFP-NEXT: st.d $zero, $fp, -24 -; LA64-WITHFP-NEXT: st.d $zero, $fp, -32 -; LA64-WITHFP-NEXT: st.d $zero, $fp, -40 +; LA64-WITHFP-NEXT: vrepli.b $vr0, 0 +; LA64-WITHFP-NEXT: vst $vr0, $fp, -40 ; LA64-WITHFP-NEXT: ori $a5, $zero, 1000 ; LA64-WITHFP-NEXT: ori $a0, $zero, 1 ; LA64-WITHFP-NEXT: ori $a1, $zero, 11 diff --git a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll index 8dd1ec465c13a..3e1b6d8eaadbc 100644 --- 
a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll +++ b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll @@ -67,19 +67,10 @@ define void @test_zero(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_zero: ; LA64D: # %bb.0: -; LA64D-NEXT: fld.s $fa0, $a0, 12 -; LA64D-NEXT: fld.s $fa1, $a0, 0 -; LA64D-NEXT: fld.s $fa2, $a0, 4 -; LA64D-NEXT: fld.s $fa3, $a0, 8 -; LA64D-NEXT: movgr2fr.w $fa4, $zero -; LA64D-NEXT: fadd.s $fa1, $fa1, $fa4 -; LA64D-NEXT: fadd.s $fa2, $fa2, $fa4 -; LA64D-NEXT: fadd.s $fa3, $fa3, $fa4 -; LA64D-NEXT: fadd.s $fa0, $fa0, $fa4 -; LA64D-NEXT: fst.s $fa0, $a1, 12 -; LA64D-NEXT: fst.s $fa3, $a1, 8 -; LA64D-NEXT: fst.s $fa2, $a1, 4 -; LA64D-NEXT: fst.s $fa1, $a1, 0 +; LA64D-NEXT: vld $vr0, $a0, 0 +; LA64D-NEXT: vrepli.b $vr1, 0 +; LA64D-NEXT: vfadd.s $vr0, $vr0, $vr1 +; LA64D-NEXT: vst $vr0, $a1, 0 ; LA64D-NEXT: ret %p = load %f4, ptr %P %R = fadd %f4 %p, zeroinitializer @@ -135,17 +126,17 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_f2: ; LA64D: # %bb.0: -; LA64D-NEXT: fld.s $fa0, $a0, 4 -; LA64D-NEXT: fld.s $fa1, $a0, 0 -; LA64D-NEXT: addi.w $a0, $zero, 1 -; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) -; LA64D-NEXT: movgr2fr.w $fa3, $a0 -; LA64D-NEXT: ffint.s.w $fa3, $fa3 -; LA64D-NEXT: fadd.s $fa1, $fa1, $fa3 -; LA64D-NEXT: fadd.s $fa0, $fa0, $fa2 -; LA64D-NEXT: fst.s $fa0, $a1, 4 -; LA64D-NEXT: fst.s $fa1, $a1, 0 +; LA64D-NEXT: addi.d $sp, $sp, -16 +; LA64D-NEXT: ld.d $a0, $a0, 0 +; LA64D-NEXT: st.d $a0, $sp, 0 +; LA64D-NEXT: vld $vr0, $sp, 0 +; LA64D-NEXT: lu12i.w $a0, 260096 +; LA64D-NEXT: lu52i.d $a0, $a0, 1024 +; LA64D-NEXT: vreplgr2vr.d $vr1, $a0 +; LA64D-NEXT: vfadd.s $vr0, $vr0, $vr1 +; LA64D-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64D-NEXT: st.d $a0, $a1, 0 +; LA64D-NEXT: addi.d $sp, $sp, 16 ; LA64D-NEXT: ret %p = load %f2, ptr %P %R = fadd %f2 %p, < float 1.000000e+00, float 2.000000e+00 > @@ -231,27 +222,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; ; 
LA64D-LABEL: test_f4: ; LA64D: # %bb.0: -; LA64D-NEXT: fld.s $fa0, $a0, 12 -; LA64D-NEXT: fld.s $fa1, $a0, 8 -; LA64D-NEXT: fld.s $fa2, $a0, 4 -; LA64D-NEXT: fld.s $fa3, $a0, 0 -; LA64D-NEXT: addi.w $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa4, $a0 -; LA64D-NEXT: ffint.s.w $fa4, $fa4 +; LA64D-NEXT: vld $vr0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA64D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) -; LA64D-NEXT: fadd.s $fa3, $fa3, $fa4 -; LA64D-NEXT: fadd.s $fa2, $fa2, $fa5 -; LA64D-NEXT: fadd.s $fa1, $fa1, $fa6 -; LA64D-NEXT: fadd.s $fa0, $fa0, $fa7 -; LA64D-NEXT: fst.s $fa0, $a1, 12 -; LA64D-NEXT: fst.s $fa1, $a1, 8 -; LA64D-NEXT: fst.s $fa2, $a1, 4 -; LA64D-NEXT: fst.s $fa3, $a1, 0 +; LA64D-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) +; LA64D-NEXT: vfadd.s $vr0, $vr0, $vr1 +; LA64D-NEXT: vst $vr0, $a1, 0 ; LA64D-NEXT: ret %p = load %f4, ptr %P %R = fadd %f4 %p, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 > @@ -373,39 +348,14 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_f8: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.w $a2, $zero, 1 -; LA64D-NEXT: movgr2fr.w $fa0, $a2 +; LA64D-NEXT: vld $vr0, $a0, 16 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) -; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) -; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64D-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) -; LA64D-NEXT: fld.s $fa4, $a0, 28 -; LA64D-NEXT: fld.s $fa5, $a0, 24 -; LA64D-NEXT: fld.s $fa6, $a0, 12 -; LA64D-NEXT: fld.s $fa7, $a0, 8 -; LA64D-NEXT: fld.s $ft0, $a0, 0 -; LA64D-NEXT: fld.s $ft1, $a0, 16 -; LA64D-NEXT: fld.s $ft2, $a0, 4 -; LA64D-NEXT: ffint.s.w $fa0, $fa0 -; LA64D-NEXT: fadd.s $ft0, 
$ft0, $fa0 -; LA64D-NEXT: fadd.s $fa0, $ft1, $fa0 -; LA64D-NEXT: fld.s $ft1, $a0, 20 -; LA64D-NEXT: fadd.s $ft2, $ft2, $fa1 -; LA64D-NEXT: fadd.s $fa7, $fa7, $fa2 -; LA64D-NEXT: fadd.s $fa6, $fa6, $fa3 -; LA64D-NEXT: fadd.s $fa1, $ft1, $fa1 -; LA64D-NEXT: fadd.s $fa2, $fa5, $fa2 -; LA64D-NEXT: fadd.s $fa3, $fa4, $fa3 -; LA64D-NEXT: fst.s $fa3, $a1, 28 -; LA64D-NEXT: fst.s $fa2, $a1, 24 -; LA64D-NEXT: fst.s $fa1, $a1, 20 -; LA64D-NEXT: fst.s $fa6, $a1, 12 -; LA64D-NEXT: fst.s $fa7, $a1, 8 -; LA64D-NEXT: fst.s $ft2, $a1, 4 -; LA64D-NEXT: fst.s $fa0, $a1, 16 -; LA64D-NEXT: fst.s $ft0, $a1, 0 +; LA64D-NEXT: vld $vr1, $a2, %pc_lo12(.LCPI3_0) +; LA64D-NEXT: vld $vr2, $a0, 0 +; LA64D-NEXT: vfadd.s $vr0, $vr0, $vr1 +; LA64D-NEXT: vfadd.s $vr1, $vr2, $vr1 +; LA64D-NEXT: vst $vr1, $a1, 0 +; LA64D-NEXT: vst $vr0, $a1, 16 ; LA64D-NEXT: ret %p = load %f8, ptr %P %R = fadd %f8 %p, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 > @@ -496,17 +446,11 @@ define void @test_d2(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_d2: ; LA64D: # %bb.0: -; LA64D-NEXT: fld.d $fa0, $a0, 8 -; LA64D-NEXT: fld.d $fa1, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0) -; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI4_0) -; LA64D-NEXT: movgr2fr.d $fa3, $a0 -; LA64D-NEXT: ffint.d.l $fa3, $fa3 -; LA64D-NEXT: fadd.d $fa1, $fa1, $fa3 -; LA64D-NEXT: fadd.d $fa0, $fa0, $fa2 -; LA64D-NEXT: fst.d $fa0, $a1, 8 -; LA64D-NEXT: fst.d $fa1, $a1, 0 +; LA64D-NEXT: vld $vr0, $a0, 0 +; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; LA64D-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) +; LA64D-NEXT: vfadd.d $vr0, $vr0, $vr1 +; LA64D-NEXT: vst $vr0, $a1, 0 ; LA64D-NEXT: ret %p = load %d2, ptr %P %R = fadd %d2 %p, < double 1.000000e+00, double 2.000000e+00 > @@ -655,27 +599,16 @@ define void @test_d4(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_d4: ; LA64D: # %bb.0: 
-; LA64D-NEXT: fld.d $fa0, $a0, 24 -; LA64D-NEXT: fld.d $fa1, $a0, 16 -; LA64D-NEXT: fld.d $fa2, $a0, 8 -; LA64D-NEXT: fld.d $fa3, $a0, 0 -; LA64D-NEXT: addi.d $a0, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa4, $a0 -; LA64D-NEXT: ffint.d.l $fa4, $fa4 +; LA64D-NEXT: vld $vr0, $a0, 0 +; LA64D-NEXT: vld $vr1, $a0, 16 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_0) +; LA64D-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64D-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_1) -; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) -; LA64D-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI5_2) -; LA64D-NEXT: fadd.d $fa3, $fa3, $fa4 -; LA64D-NEXT: fadd.d $fa2, $fa2, $fa5 -; LA64D-NEXT: fadd.d $fa1, $fa1, $fa6 -; LA64D-NEXT: fadd.d $fa0, $fa0, $fa7 -; LA64D-NEXT: fst.d $fa0, $a1, 24 -; LA64D-NEXT: fst.d $fa1, $a1, 16 -; LA64D-NEXT: fst.d $fa2, $a1, 8 -; LA64D-NEXT: fst.d $fa3, $a1, 0 +; LA64D-NEXT: vld $vr3, $a0, %pc_lo12(.LCPI5_1) +; LA64D-NEXT: vfadd.d $vr1, $vr1, $vr2 +; LA64D-NEXT: vfadd.d $vr0, $vr0, $vr3 +; LA64D-NEXT: vst $vr0, $a1, 0 +; LA64D-NEXT: vst $vr1, $a1, 16 ; LA64D-NEXT: ret %p = load %d4, ptr %P %R = fadd %d4 %p, < double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00 > @@ -934,39 +867,22 @@ define void @test_d8(ptr %P, ptr %S) nounwind { ; ; LA64D-LABEL: test_d8: ; LA64D: # %bb.0: -; LA64D-NEXT: addi.d $a2, $zero, 1 -; LA64D-NEXT: movgr2fr.d $fa0, $a2 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_0) -; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI6_0) +; LA64D-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI6_0) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_1) -; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI6_1) -; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_2) -; LA64D-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI6_2) -; LA64D-NEXT: fld.d $fa4, $a0, 56 -; LA64D-NEXT: fld.d $fa5, $a0, 48 -; LA64D-NEXT: fld.d $fa6, $a0, 24 -; LA64D-NEXT: fld.d $fa7, $a0, 16 -; LA64D-NEXT: fld.d $ft0, $a0, 0 
-; LA64D-NEXT: fld.d $ft1, $a0, 32 -; LA64D-NEXT: fld.d $ft2, $a0, 8 -; LA64D-NEXT: ffint.d.l $fa0, $fa0 -; LA64D-NEXT: fadd.d $ft0, $ft0, $fa0 -; LA64D-NEXT: fadd.d $fa0, $ft1, $fa0 -; LA64D-NEXT: fld.d $ft1, $a0, 40 -; LA64D-NEXT: fadd.d $ft2, $ft2, $fa1 -; LA64D-NEXT: fadd.d $fa7, $fa7, $fa2 -; LA64D-NEXT: fadd.d $fa6, $fa6, $fa3 -; LA64D-NEXT: fadd.d $fa1, $ft1, $fa1 -; LA64D-NEXT: fadd.d $fa2, $fa5, $fa2 -; LA64D-NEXT: fadd.d $fa3, $fa4, $fa3 -; LA64D-NEXT: fst.d $fa3, $a1, 56 -; LA64D-NEXT: fst.d $fa2, $a1, 48 -; LA64D-NEXT: fst.d $fa1, $a1, 40 -; LA64D-NEXT: fst.d $fa6, $a1, 24 -; LA64D-NEXT: fst.d $fa7, $a1, 16 -; LA64D-NEXT: fst.d $ft2, $a1, 8 -; LA64D-NEXT: fst.d $fa0, $a1, 32 -; LA64D-NEXT: fst.d $ft0, $a1, 0 +; LA64D-NEXT: vld $vr1, $a2, %pc_lo12(.LCPI6_1) +; LA64D-NEXT: vld $vr2, $a0, 16 +; LA64D-NEXT: vld $vr3, $a0, 0 +; LA64D-NEXT: vld $vr4, $a0, 48 +; LA64D-NEXT: vld $vr5, $a0, 32 +; LA64D-NEXT: vfadd.d $vr2, $vr2, $vr0 +; LA64D-NEXT: vfadd.d $vr3, $vr3, $vr1 +; LA64D-NEXT: vfadd.d $vr0, $vr4, $vr0 +; LA64D-NEXT: vfadd.d $vr1, $vr5, $vr1 +; LA64D-NEXT: vst $vr1, $a1, 32 +; LA64D-NEXT: vst $vr0, $a1, 48 +; LA64D-NEXT: vst $vr3, $a1, 0 +; LA64D-NEXT: vst $vr2, $a1, 16 ; LA64D-NEXT: ret %p = load %d8, ptr %P %R = fadd %d8 %p, < double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00 > diff --git a/llvm/test/CodeGen/Mips/no-odd-spreg.ll b/llvm/test/CodeGen/Mips/no-odd-spreg.ll index 467e458ba82db..b6a687dcfe17a 100644 --- a/llvm/test/CodeGen/Mips/no-odd-spreg.ll +++ b/llvm/test/CodeGen/Mips/no-odd-spreg.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=mipsel -mcpu=mips32 < %s \ +; RUN: llc -mtriple=mipsel-elf -mcpu=mips32 < %s \ ; RUN: | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-NO-EMIT -; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s \ +; RUN: llc -mtriple=mipsel-elf -mcpu=mips32 -mattr=+nooddspreg < %s \ ; RUN: | FileCheck %s 
-check-prefixes=ALL,NOODDSPREG -; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s \ +; RUN: llc -mtriple=mipsel-elf -mcpu=mips32r6 -mattr=fp64 < %s \ ; RUN: | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-NO-EMIT -; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s \ +; RUN: llc -mtriple=mipsel-elf -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s \ ; RUN: | FileCheck %s -check-prefixes=ALL,NOODDSPREG -; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s \ +; RUN: llc -mtriple=mipsel-elf -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s \ ; RUN: | FileCheck %s -check-prefixes=ALL,ODDSPREG,ODDSPREG-EMIT ; We don't emit a directive unless we need to. This is to support versions of diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll new file mode 100644 index 0000000000000..df9c3e59b0e6b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \ +; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: %} + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: test_bitcast_2xi8_i16( +; CHECK: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0]; +; CHECK: mov.b32 {%rs1, %rs2}, %r1; +; CHECK: shl.b16 %rs3, %rs2, 8; +; CHECK: and.b16 %rs4, %rs1, 255; +; CHECK: or.b16 %rs5, %rs4, %rs3; +; CHECK: cvt.u32.u16 %r2, %rs5; +; CHECK: st.param.b32 [func_retval0], %r2; +define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) { + %res = bitcast <2 x i8> %a to i16 + ret i16 %res +} + +; CHECK-LABEL: test_bitcast_i16_2xi8( +; CHECK: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0]; +; CHECK: shr.u16 %rs2, %rs1, 8; +; 
CHECK: mov.b32 %r1, {%rs1, %rs2}; +; CHECK: st.param.b32 [func_retval0], %r1; +define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { + %res = bitcast i16 %a to <2 x i8> + ret <2 x i8> %res +} diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 5b5662a1eea76..a16a5b435962d 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; 
CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, 
%rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; 
CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: 
ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; 
CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; 
CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: bfi.b32 
%r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 13120; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, 
%r11, 13120; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; 
+; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 13120; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: 
bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 13120; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r22; ; CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; 
CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,12 +548,13 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 13120; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -613,12 +614,13 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; 
-; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 13120; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -678,12 +680,13 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 13120; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -926,40 +929,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<28>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; +; 
CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 13120; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; -; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; -; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; +; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; +; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1006,32 +1009,32 @@ define <4 x i8> 
@test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; -; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; -; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; +; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; +; CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 13120; +; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; -; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; +; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; +; 
CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1044,13 +1047,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; -; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; +; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 13120; +; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1060,19 +1063,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; -; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; -; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; -; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; -; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 13120; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %r = 
trunc <4 x i64> %a to <4 x i8> @@ -1184,15 +1187,16 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: mov.b32 %r1, 6; +; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 13120; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1255,27 +1259,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 13120; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: 
mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1286,27 +1290,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 13120; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; 
CHECK-NEXT: st.param.b32 [func_retval0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1326,33 +1330,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 13120; +; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; +; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; +; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; ; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; 
CHECK-NEXT: ret; entry: @@ -1373,7 +1377,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b32 %r<17>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1392,25 +1396,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; -; CHECK-NEXT: // implicit-def: %r15 -; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; +; CHECK-NEXT: // implicit-def: %r14 +; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1437,25 +1441,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 
%r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 13120; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 0cb0c1ba8c6bd..0a9fc724945c5 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; 
CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 13120; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; -; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; +; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0], %r12; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index 21bd4bb8502c3..8aeea4ba045bf 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -70,9 +70,6 @@ ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation -; CHECK-NEXT: PPC Merge String Pool -; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Prepare loop for ppc preferred instruction forms diff --git 
a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll index c7b1d2a0771c1..f14901017982b 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll @@ -1,13 +1,13 @@ ; This file tests the codegen of mergeable const in AIX assembly. ; This file also tests mergeable const in XCOFF object file generation. ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ -; RUN: -data-sections=false -xcoff-traceback-table=false < %s | \ +; RUN: -global-merge-all-const=false -data-sections=false -xcoff-traceback-table=false < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,CHECK32 %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc64-ibm-aix-xcoff \ -; RUN: -xcoff-traceback-table=false -data-sections=false < %s | \ +; RUN: -global-merge-all-const=false -xcoff-traceback-table=false -data-sections=false < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,CHECK64 %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ -; RUN: -xcoff-traceback-table=false -data-sections=false -filetype=obj -o %t.o < %s +; RUN: -global-merge-all-const=false -xcoff-traceback-table=false -data-sections=false -filetype=obj -o %t.o < %s ; RUN: llvm-objdump -D %t.o | FileCheck --check-prefix=CHECKOBJ %s ; RUN: llvm-readobj -s %t.o | FileCheck --check-prefix=CHECKSYM %s diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll index f70428b102895..5462240846994 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-str.ll @@ -4,12 +4,15 @@ ; tests for XCOFF object files. 
; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -xcoff-traceback-table=false \ -; RUN: -mtriple powerpc-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false < %s | FileCheck %s +; RUN: -mtriple powerpc-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -global-merge-all-const=false < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -xcoff-traceback-table=false \ -; RUN: -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false < %s | FileCheck %s +; RUN: -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -global-merge-all-const=false < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec -mtriple powerpc-ibm-aix-xcoff \ -; RUN: -xcoff-traceback-table=false -data-sections=false -ppc-merge-string-pool=false -filetype=obj -o %t.o < %s +; RUN: -xcoff-traceback-table=false -data-sections=false -ppc-merge-string-pool=false \ +; RUN: -global-merge-all-const=false -filetype=obj -o %t.o < %s ; RUN: llvm-objdump -D %t.o | FileCheck --check-prefix=CHECKOBJ %s @magic16 = private unnamed_addr constant [4 x i16] [i16 264, i16 272, i16 213, i16 0], align 2 diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll index fa9a8fb457518..7f93661c37ee8 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-used-with-stringpool.ll @@ -1,10 +1,12 @@ ;; Test that the string pooling pass does not pool globals that are ;; in llvm.used or in llvm.compiler.used. 
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff -data-sections=false < %s | \ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc-ibm-aix-xcoff \ +; RUN: -ppc-merge-string-pool=true -global-merge-all-const=false -data-sections=false < %s | \ ; RUN: FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-ibm-aix-xcoff -data-sections=false < %s | \ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-merge-string-pool=true -global-merge-all-const=false -data-sections=false < %s | \ ; RUN: FileCheck %s @keep_this = internal constant [5 x i8] c"keep1", align 1 diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir index dfbf412a93921..cdd6be56b46d5 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs-out-of-range.mir @@ -604,7 +604,7 @@ body: | %2 = LI 48 %5 = COPY %0.sub_32 %8 = SRW killed %5, killed %2 - ; CHECK: LI 0 + ; CHECK: LI8 0 ; CHECK-LATE: li 3, 0 $x3 = EXTSW_32_64 %8 BLR8 implicit $lr8, implicit $rm, implicit $x3 @@ -722,7 +722,7 @@ body: | %3 = COPY %0.sub_32 %4 = SRAW killed %3, killed %2, implicit-def dead $carry ; CHECK: LI 48 - ; CHECK: SRAW killed %3, killed %2, implicit-def dead $carry + ; CHECK: SRAW8 killed %7, killed %9, implicit-def $carry, implicit-def dead $carry ; CHECK-LATE: sraw 3, 3, 4 %5 = EXTSW_32_64 killed %4 $x3 = COPY %5 @@ -779,7 +779,7 @@ body: | %2 = LI 80 %3 = COPY %0.sub_32 %4 = SRAW_rec killed %3, %2, implicit-def dead $carry, implicit-def $cr0 - ; CHECK: SRAW_rec killed %3, %2, implicit-def dead $carry, implicit-def $cr0 + ; CHECK: SRAW8_rec killed %10, killed %12, implicit-def $carry, implicit-def $cr0, implicit-def dead $carry, implicit-def $cr0 ; CHECK-LATE: sraw. 
3, 3, 4 %5 = COPY killed $cr0 %6 = ISEL %2, %4, %5.sub_eq diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index 761316ed7726d..fa06dd551a0d4 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -1348,7 +1348,7 @@ body: | %1 = LI 77 %2 = ADDI killed %1, 44 %3 = EXTSW_32_64 killed %2 - ; CHECK: LI 121 + ; CHECK: LI8 121 ; CHECK-LATE: li 3, 121 $x3 = COPY %3 BLR8 implicit $lr8, implicit $rm, implicit $x3 @@ -3573,7 +3573,7 @@ body: | %0 = LI 777 %1 = ORI %0, 88 - ; CHECK: LI 857 + ; CHECK: LI8 857 ; CHECK-LATE: li 3, 857 $x3 = EXTSW_32_64 %1 BLR8 implicit $lr8, implicit $rm, implicit $x3 @@ -4145,7 +4145,7 @@ body: | %3 = IMPLICIT_DEF %2 = LI 17 %4 = RLWINM killed %2, 4, 20, 27 - ; CHECK: LI 272 + ; CHECK: LI8 272 ; CHECK-LATE: li 3, 272 $x3 = EXTSW_32_64 %4 BLR8 implicit $lr8, implicit $rm, implicit $x3 @@ -4990,7 +4990,10 @@ body: | %2 = LI 15 %3 = COPY %0.sub_32 %4 = SRAW killed %3, killed %2, implicit-def dead $carry - ; CHECK: SRAWI killed %3, 15, implicit-def dead $carry + ; CHECK: %6:g8rc = IMPLICIT_DEF + ; CHECK-NEXT: %7:g8rc = INSERT_SUBREG %6, killed %3, %subreg.sub_32 + ; CHECK-NEXT: %8:g8rc = SRAWI8 killed %7, 15, implicit-def $carry, implicit-def dead $carry + ; CHECK-NEXT: %4:gprc = COPY killed %8.sub_32 ; CHECK-LATE: srawi 3, 3, 15 %5 = EXTSW_32_64 killed %4 $x3 = COPY %5 @@ -5047,7 +5050,7 @@ body: | %2 = LI 8 %3 = COPY %0.sub_32 %4 = SRAW_rec killed %3, %2, implicit-def dead $carry, implicit-def $cr0 - ; CHECK: SRAWI_rec killed %3, 8, implicit-def dead $carry, implicit-def $cr0 + ; CHECK: %11:g8rc = SRAWI8_rec killed %10, 8, implicit-def $carry, implicit-def $cr0, implicit-def dead $carry, implicit-def $cr0 ; CHECK-LATE: srawi. 
3, 3, 8 %5 = COPY killed $cr0 %6 = ISEL %2, %4, %5.sub_eq @@ -6456,7 +6459,7 @@ body: | %0 = LI 871 %1 = XORI %0, 17 - ; CHECK: LI 886 + ; CHECK: LI8 886 ; CHECK-LATE: li 3, 886 $x3 = EXTSW_32_64 %1 BLR8 implicit $lr8, implicit $rm, implicit $x3 diff --git a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll index 4710d5c14e5b1..4969aec0a1494 100644 --- a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll +++ b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll @@ -12,19 +12,11 @@ target triple = "powerpc-ibm-aix" ; CHECK-NEXT: L..__llvm_covinit_functions: ; CHECK-NEXT: .vbyte 4, __llvm_gcov_writeout[DS] ; CHECK-NEXT: .vbyte 4, __llvm_gcov_reset[DS] +; The first .csect directive below is specifying the content of the csect. +; The second .csect directive below is used to insert the .ref pseudo +; instruction. +; CHECK: .csect __llvm_gcov_ctr_section[RW],3 ; CHECK: .csect __llvm_gcov_ctr_section[RW],3 -; CHECK-NEXT: .lglobl __llvm_gcov_ctr # @_MergedGlobals -; CHECK-NEXT: .lglobl __llvm_gcov_ctr.1 -; CHECK-NEXT: .align 3 -; CHECK-NEXT: L.._MergedGlobals: -; CHECK-NEXT: __llvm_gcov_ctr: -; CHECK-NEXT: .space 16 -; CHECK-NEXT: __llvm_gcov_ctr.1: -; CHECK-NEXT: .extern .llvm_gcda_start_file[PR] -; CHECK-NEXT: .extern .llvm_gcda_emit_function[PR] -; CHECK-NEXT: .extern .llvm_gcda_emit_arcs[PR] -; CHECK-NEXT: .extern .llvm_gcda_summary_info[PR] -; CHECK-NEXT: .extern .llvm_gcda_end_file[PR] ; CHECK-RW-NEXT: .ref __llvm_covinit[RW] ; CHECK-RO-NEXT: .ref __llvm_covinit[RO] diff --git a/llvm/test/CodeGen/PowerPC/merge-private.ll b/llvm/test/CodeGen/PowerPC/merge-private.ll index 0ca706abb275f..d3f2910826423 100644 --- a/llvm/test/CodeGen/PowerPC/merge-private.ll +++ b/llvm/test/CodeGen/PowerPC/merge-private.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names 
-ppc-global-merge=true < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ ; RUN: --check-prefix=AIX64 ; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 \ -; RUN: -ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ ; RUN: --check-prefix=AIX32 ; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux -mcpu=pwr8 \ ; RUN: -ppc-asm-full-reg-names -ppc-global-merge=true < %s | FileCheck %s \ diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-large.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-large.ll index b182763ccc146..27923e47b86b6 100644 --- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-large.ll +++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-large.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 \ +; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 -ppc-global-merge-max-offset=50000 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=AIX32 -; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 \ +; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 -ppc-global-merge-max-offset=50000 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=AIX64 ; RUN: llc -verify-machineinstrs -mtriple powerpc64-unknown-linux -mcpu=pwr8 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=LINUX64BE @@ -242,10 +242,8 @@ define dso_local signext i32 @str1() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) -; AIX32-NEXT: addi r3, r3, -29924 -; AIX32-NEXT: addis r3, r3, 1 ; 
AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -257,11 +255,8 @@ define dso_local signext i32 @str1() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: li r4, 0 -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) -; AIX64-NEXT: ori r4, r4, 35612 -; AIX64-NEXT: add r3, r3, r4 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -318,17 +313,16 @@ define dso_local signext i32 @array0() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -96(r1) -; AIX32-NEXT: lis r6, 0 -; AIX32-NEXT: lwz r5, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: li r4, 12 +; AIX32-NEXT: lwz r6, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: li r7, 24 ; AIX32-NEXT: addi r3, r1, 64 +; AIX32-NEXT: li r4, 12 ; AIX32-NEXT: stw r0, 104(r1) -; AIX32-NEXT: ori r7, r6, 35596 -; AIX32-NEXT: rlwimi r4, r3, 0, 30, 27 -; AIX32-NEXT: lxvw4x vs0, r5, r7 -; AIX32-NEXT: stxvw4x vs0, 0, r4 -; AIX32-NEXT: ori r4, r6, 35584 -; AIX32-NEXT: lxvw4x vs0, r5, r4 +; AIX32-NEXT: mr r5, r3 +; AIX32-NEXT: lxvw4x vs0, r6, r7 +; AIX32-NEXT: rlwimi r5, r4, 0, 28, 29 +; AIX32-NEXT: stxvw4x vs0, 0, r5 +; AIX32-NEXT: lxvw4x vs0, r6, r4 ; AIX32-NEXT: stxvw4x vs0, 0, r3 ; AIX32-NEXT: bl .calleeInt[PR] ; AIX32-NEXT: nop @@ -341,15 +335,14 @@ define dso_local signext i32 @array0() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -144(r1) -; AIX64-NEXT: li r3, 0 -; AIX64-NEXT: ld r4, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: li r4, 24 ; AIX64-NEXT: std r0, 160(r1) -; AIX64-NEXT: ori r5, r3, 35596 -; AIX64-NEXT: ori r3, r3, 35584 -; AIX64-NEXT: lxvw4x vs0, r4, r5 -; AIX64-NEXT: addi r5, r1, 124 -; AIX64-NEXT: stxvw4x vs0, 0, r5 -; AIX64-NEXT: lxvw4x vs0, r4, r3 +; AIX64-NEXT: lxvw4x 
vs0, r3, r4 +; AIX64-NEXT: addi r4, r1, 124 +; AIX64-NEXT: stxvw4x vs0, 0, r4 +; AIX64-NEXT: li r4, 12 +; AIX64-NEXT: lxvw4x vs0, r3, r4 ; AIX64-NEXT: addi r3, r1, 112 ; AIX64-NEXT: stxvw4x vs0, 0, r3 ; AIX64-NEXT: bl .calleeInt[PR] @@ -418,28 +411,35 @@ define dso_local signext i32 @array1() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -176(r1) -; AIX32-NEXT: lwz r4, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r4, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: li r3, 136 ; AIX32-NEXT: li r5, 96 -; AIX32-NEXT: addi r3, r1, 64 ; AIX32-NEXT: stw r0, 184(r1) -; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: lxvw4x vs0, r4, r3 +; AIX32-NEXT: addi r3, r1, 64 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 80 +; AIX32-NEXT: li r5, 120 ; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: li r5, 80 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 64 +; AIX32-NEXT: li r5, 104 ; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: li r5, 64 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 48 +; AIX32-NEXT: li r5, 88 ; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: li r5, 48 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 32 +; AIX32-NEXT: li r5, 72 ; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: li r5, 32 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 16 +; AIX32-NEXT: li r5, 56 ; AIX32-NEXT: lxvw4x vs0, r4, r5 +; AIX32-NEXT: li r5, 16 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: lxvw4x vs0, 0, r4 +; AIX32-NEXT: li r5, 40 +; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: stxvw4x vs0, 0, r3 ; AIX32-NEXT: bl .calleeInt[PR] ; AIX32-NEXT: nop @@ -452,28 +452,35 @@ define dso_local signext i32 @array1() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -224(r1) -; AIX64-NEXT: ld r4, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r4, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: li r3, 136 ; AIX64-NEXT: li r5, 96 -; AIX64-NEXT: addi r3, r1, 112 ; 
AIX64-NEXT: std r0, 240(r1) -; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: lxvw4x vs0, r4, r3 +; AIX64-NEXT: addi r3, r1, 112 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 80 +; AIX64-NEXT: li r5, 120 ; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: li r5, 80 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 64 +; AIX64-NEXT: li r5, 104 ; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: li r5, 64 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 48 +; AIX64-NEXT: li r5, 88 ; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: li r5, 48 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 32 +; AIX64-NEXT: li r5, 72 ; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: li r5, 32 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 16 +; AIX64-NEXT: li r5, 56 ; AIX64-NEXT: lxvw4x vs0, r4, r5 +; AIX64-NEXT: li r5, 16 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: lxvw4x vs0, 0, r4 +; AIX64-NEXT: li r5, 40 +; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: stxvw4x vs0, 0, r3 ; AIX64-NEXT: bl .calleeInt[PR] ; AIX64-NEXT: nop @@ -565,34 +572,34 @@ define dso_local signext i32 @array2() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -176(r1) -; AIX32-NEXT: lwz r4, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: li r3, 208 +; AIX32-NEXT: lwz r4, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: li r3, 248 ; AIX32-NEXT: li r5, 96 ; AIX32-NEXT: stw r0, 184(r1) ; AIX32-NEXT: lxvw4x vs0, r4, r3 ; AIX32-NEXT: addi r3, r1, 64 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 192 +; AIX32-NEXT: li r5, 232 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: li r5, 80 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 176 +; AIX32-NEXT: li r5, 216 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: li r5, 64 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 160 +; AIX32-NEXT: li r5, 200 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: li r5, 48 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 144 +; AIX32-NEXT: li 
r5, 184 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: li r5, 32 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 128 +; AIX32-NEXT: li r5, 168 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: li r5, 16 ; AIX32-NEXT: stxvw4x vs0, r3, r5 -; AIX32-NEXT: li r5, 112 +; AIX32-NEXT: li r5, 152 ; AIX32-NEXT: lxvw4x vs0, r4, r5 ; AIX32-NEXT: stxvw4x vs0, 0, r3 ; AIX32-NEXT: bl .calleeInt[PR] @@ -606,34 +613,34 @@ define dso_local signext i32 @array2() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -224(r1) -; AIX64-NEXT: ld r4, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: li r3, 208 +; AIX64-NEXT: ld r4, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: li r3, 248 ; AIX64-NEXT: li r5, 96 ; AIX64-NEXT: std r0, 240(r1) ; AIX64-NEXT: lxvw4x vs0, r4, r3 ; AIX64-NEXT: addi r3, r1, 112 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 192 +; AIX64-NEXT: li r5, 232 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: li r5, 80 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 176 +; AIX64-NEXT: li r5, 216 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: li r5, 64 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 160 +; AIX64-NEXT: li r5, 200 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: li r5, 48 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 144 +; AIX64-NEXT: li r5, 184 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: li r5, 32 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 128 +; AIX64-NEXT: li r5, 168 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: li r5, 16 ; AIX64-NEXT: stxvw4x vs0, r3, r5 -; AIX64-NEXT: li r5, 112 +; AIX64-NEXT: li r5, 152 ; AIX64-NEXT: lxvw4x vs0, r4, r5 ; AIX64-NEXT: stxvw4x vs0, 0, r3 ; AIX64-NEXT: bl .calleeInt[PR] @@ -740,12 +747,12 @@ define dso_local signext i32 @array3() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -224(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # 
@_MergedGlobals ; AIX32-NEXT: stw r0, 232(r1) ; AIX32-NEXT: stw r31, 220(r1) # 4-byte Folded Spill ; AIX32-NEXT: addi r31, r1, 56 ; AIX32-NEXT: li r5, 160 -; AIX32-NEXT: addi r4, r3, 224 +; AIX32-NEXT: addi r4, r3, 264 ; AIX32-NEXT: mr r3, r31 ; AIX32-NEXT: bl .___memmove[PR] ; AIX32-NEXT: nop @@ -762,12 +769,12 @@ define dso_local signext i32 @array3() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -288(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 304(r1) ; AIX64-NEXT: std r31, 280(r1) # 8-byte Folded Spill ; AIX64-NEXT: addi r31, r1, 120 ; AIX64-NEXT: li r5, 160 -; AIX64-NEXT: addi r4, r3, 224 +; AIX64-NEXT: addi r4, r3, 264 ; AIX64-NEXT: mr r3, r31 ; AIX64-NEXT: bl .___memmove64[PR] ; AIX64-NEXT: nop @@ -839,12 +846,12 @@ define dso_local signext i32 @array4() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -384(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 392(r1) ; AIX32-NEXT: stw r31, 380(r1) # 4-byte Folded Spill ; AIX32-NEXT: addi r31, r1, 56 ; AIX32-NEXT: li r5, 320 -; AIX32-NEXT: addi r4, r3, 384 +; AIX32-NEXT: addi r4, r3, 424 ; AIX32-NEXT: mr r3, r31 ; AIX32-NEXT: bl .___memmove[PR] ; AIX32-NEXT: nop @@ -861,12 +868,12 @@ define dso_local signext i32 @array4() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -448(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 464(r1) ; AIX64-NEXT: std r31, 440(r1) # 8-byte Folded Spill ; AIX64-NEXT: addi r31, r1, 120 ; AIX64-NEXT: li r5, 320 -; AIX64-NEXT: addi r4, r3, 384 +; AIX64-NEXT: addi r4, r3, 424 ; AIX64-NEXT: mr r3, r31 ; AIX64-NEXT: bl .___memmove64[PR] ; AIX64-NEXT: nop diff --git 
a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll index fde135246c2a9..1f2ee88ea7645 100644 --- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll +++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool-tls.ll @@ -1,9 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \ -; RUN: -ppc-asm-full-reg-names -enable-global-merge=false < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ ; RUN: --check-prefix=CHECK64 ; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff \ -; RUN: -ppc-asm-full-reg-names -enable-global-merge=false < %s | FileCheck %s \ +; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ ; RUN: --check-prefix=CHECK32 ; RUN: llc -verify-machineinstrs -mtriple powerpc64le-unknown-linux \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s \ @@ -31,14 +30,13 @@ define void @print_tls_func() { ; CHECK64-NEXT: stdu r1, -112(r1) ; CHECK64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML" ; CHECK64-NEXT: std r0, 128(r1) -; CHECK64-NEXT: ld r6, L..C1(r2) # @__ModuleStringPool +; CHECK64-NEXT: ld r6, L..C1(r2) # @_MergedGlobals ; CHECK64-NEXT: bla .__tls_get_mod[PR] ; CHECK64-NEXT: ld r4, L..C2(r2) # target-flags(ppc-tlsld) @a ; CHECK64-NEXT: ld r5, L..C3(r2) # target-flags(ppc-tlsld) @b ; CHECK64-NEXT: add r4, r3, r4 ; CHECK64-NEXT: add r5, r3, r5 -; CHECK64-NEXT: addi r3, r6, 71 -; CHECK64-NEXT: addi r6, r6, 56 +; CHECK64-NEXT: addi r3, r6, 72 ; CHECK64-NEXT: bl .printf[PR] ; CHECK64-NEXT: nop ; CHECK64-NEXT: addi r1, r1, 112 @@ -52,14 +50,13 @@ define void @print_tls_func() { ; CHECK32-NEXT: stwu r1, -64(r1) ; CHECK32-NEXT: lwz r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML" ; CHECK32-NEXT: stw r0, 72(r1) -; CHECK32-NEXT: lwz r6, L..C1(r2) # @__ModuleStringPool +; CHECK32-NEXT: lwz r6, L..C1(r2) # @_MergedGlobals ; 
CHECK32-NEXT: bla .__tls_get_mod[PR] ; CHECK32-NEXT: lwz r4, L..C2(r2) # target-flags(ppc-tlsld) @a ; CHECK32-NEXT: lwz r5, L..C3(r2) # target-flags(ppc-tlsld) @b ; CHECK32-NEXT: add r4, r3, r4 ; CHECK32-NEXT: add r5, r3, r5 -; CHECK32-NEXT: addi r3, r6, 71 -; CHECK32-NEXT: addi r6, r6, 56 +; CHECK32-NEXT: addi r3, r6, 72 ; CHECK32-NEXT: bl .printf[PR] ; CHECK32-NEXT: nop ; CHECK32-NEXT: addi r1, r1, 64 @@ -134,9 +131,9 @@ define void @test_func() { ; CHECK64: # %bb.0: # %entry ; CHECK64-NEXT: mflr r0 ; CHECK64-NEXT: stdu r1, -112(r1) -; CHECK64-NEXT: ld r3, L..C1(r2) # @__ModuleStringPool +; CHECK64-NEXT: ld r3, L..C1(r2) # @_MergedGlobals ; CHECK64-NEXT: std r0, 128(r1) -; CHECK64-NEXT: addi r3, r3, 16 +; CHECK64-NEXT: addi r3, r3, 32 ; CHECK64-NEXT: bl .callee[PR] ; CHECK64-NEXT: nop ; CHECK64-NEXT: addi r1, r1, 112 @@ -148,9 +145,9 @@ define void @test_func() { ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: mflr r0 ; CHECK32-NEXT: stwu r1, -64(r1) -; CHECK32-NEXT: lwz r3, L..C1(r2) # @__ModuleStringPool +; CHECK32-NEXT: lwz r3, L..C1(r2) # @_MergedGlobals ; CHECK32-NEXT: stw r0, 72(r1) -; CHECK32-NEXT: addi r3, r3, 16 +; CHECK32-NEXT: addi r3, r3, 32 ; CHECK32-NEXT: bl .callee[PR] ; CHECK32-NEXT: nop ; CHECK32-NEXT: addi r1, r1, 64 @@ -201,8 +198,9 @@ define void @test_func2() { ; CHECK64: # %bb.0: # %entry ; CHECK64-NEXT: mflr r0 ; CHECK64-NEXT: stdu r1, -112(r1) -; CHECK64-NEXT: ld r3, L..C1(r2) # @__ModuleStringPool +; CHECK64-NEXT: ld r3, L..C1(r2) # @_MergedGlobals ; CHECK64-NEXT: std r0, 128(r1) +; CHECK64-NEXT: addi r3, r3, 16 ; CHECK64-NEXT: bl .callee2[PR] ; CHECK64-NEXT: nop ; CHECK64-NEXT: addi r1, r1, 112 @@ -214,8 +212,9 @@ define void @test_func2() { ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: mflr r0 ; CHECK32-NEXT: stwu r1, -64(r1) -; CHECK32-NEXT: lwz r3, L..C1(r2) # @__ModuleStringPool +; CHECK32-NEXT: lwz r3, L..C1(r2) # @_MergedGlobals ; CHECK32-NEXT: stw r0, 72(r1) +; CHECK32-NEXT: addi r3, r3, 16 ; CHECK32-NEXT: bl .callee2[PR] ; 
CHECK32-NEXT: nop ; CHECK32-NEXT: addi r1, r1, 64 @@ -268,13 +267,17 @@ entry: ; CHECK64: .csect b[TL],2 ; CHECK64-NEXT: .lglobl b[TL] ; CHECK64-NEXT: .string "tls2" -; CHECK64: .csect L..__ModuleStringPool[RO],2 -; CHECK64-NEXT: .align 2 -; CHECK64-NEXT: .vbyte 4, 0x42af999a +; CHECK64: .csect L.._MergedGlobals[RO],2 +; CHECK64: .align 2 +; CHECK64-LABEL: c: +; CHECK64: .string "Regular global" +; CHECK64-LABEL: e: +; CHECK64: .vbyte 4, 0x42af999a ; CHECK64-NEXT: .vbyte 4, 0x42b1999a ; CHECK64-NEXT: .vbyte 4, 0x42b3cccd ; CHECK64-NEXT: .vbyte 4, 0x42b5999a -; CHECK64-NEXT: .vbyte 4, 1 +; CHECK64-LABEL: d: +; CHECK64: .vbyte 4, 1 ; CHECK64-NEXT: .vbyte 4, 2 ; CHECK64-NEXT: .vbyte 4, 3 ; CHECK64-NEXT: .vbyte 4, 4 @@ -284,10 +287,10 @@ entry: ; CHECK64-NEXT: .vbyte 4, 8 ; CHECK64-NEXT: .vbyte 4, 9 ; CHECK64-NEXT: .vbyte 4, 10 -; CHECK64-NEXT: .string "Regular global" -; CHECK64-NEXT: .byte 'T,'L,'S,' ,'v,'a,'r,'i,'a,'b,'l,'e,' ,'1,',,' ,'2,' ,'a,'n,'d,' ,'n,'o,'n,'-,'T,'L,'S,' ,'v,'a,'r,':,' ,'%,'s,',,' ,'%,'s,',,' ,'%,'s,0012,0000 +; CHECK64-LABEL: L...str +; CHECK64: .byte 'T,'L,'S,' ,'v,'a,'r,'i,'a,'b,'l,'e,' ,'1,',,' ,'2,' ,'a,'n,'d,' ,'n,'o,'n,'-,'T,'L,'S,' ,'v,'a,'r,':,' ,'%,'s,',,' ,'%,'s,',,' ,'%,'s,0012,0000 ; CHECK64: L..C1: -; CHECK64-NEXT: .tc L..__ModuleStringPool[TC],L..__ModuleStringPool[RO] +; CHECK64-NEXT: .tc L.._MergedGlobals[TC],L.._MergedGlobals[RO] ; CHECK64: L..C2: ; CHECK64-NEXT: .tc a[TC],a[TL]@ld ; CHECK64: L..C3: @@ -299,13 +302,17 @@ entry: ; CHECK32: .csect b[TL],2 ; CHECK32-NEXT: .lglobl b[TL] ; CHECK32-NEXT: .string "tls2" -; CHECK32: .csect L..__ModuleStringPool[RO],2 -; CHECK32-NEXT: .align 2 -; CHECK32-NEXT: .vbyte 4, 0x42af999a +; CHECK32: .csect L.._MergedGlobals[RO],2 +; CHECK32: .align 2 +; CHECK32-LABEL: c: +; CHECK32: .string "Regular global" +; CHECK32-LABEL: e: +; CHECK32: .vbyte 4, 0x42af999a ; CHECK32-NEXT: .vbyte 4, 0x42b1999a ; CHECK32-NEXT: .vbyte 4, 0x42b3cccd ; CHECK32-NEXT: .vbyte 4, 0x42b5999a -; CHECK32-NEXT: .vbyte 
4, 1 +; CHECK32-LABEL: d: +; CHECK32: .vbyte 4, 1 ; CHECK32-NEXT: .vbyte 4, 2 ; CHECK32-NEXT: .vbyte 4, 3 ; CHECK32-NEXT: .vbyte 4, 4 @@ -315,10 +322,10 @@ entry: ; CHECK32-NEXT: .vbyte 4, 8 ; CHECK32-NEXT: .vbyte 4, 9 ; CHECK32-NEXT: .vbyte 4, 10 -; CHECK32-NEXT: .string "Regular global" -; CHECK32-NEXT: .byte 'T,'L,'S,' ,'v,'a,'r,'i,'a,'b,'l,'e,' ,'1,',,' ,'2,' ,'a,'n,'d,' ,'n,'o,'n,'-,'T,'L,'S,' ,'v,'a,'r,':,' ,'%,'s,',,' ,'%,'s,',,' ,'%,'s,0012,0000 +; CHECK32-LABEL: L...str: +; CHECK32: .byte 'T,'L,'S,' ,'v,'a,'r,'i,'a,'b,'l,'e,' ,'1,',,' ,'2,' ,'a,'n,'d,' ,'n,'o,'n,'-,'T,'L,'S,' ,'v,'a,'r,':,' ,'%,'s,',,' ,'%,'s,',,' ,'%,'s,0012,0000 ; CHECK32: L..C1: -; CHECK32-NEXT: .tc L..__ModuleStringPool[TC],L..__ModuleStringPool[RO] +; CHECK32-NEXT: .tc L.._MergedGlobals[TC],L.._MergedGlobals[RO] ; CHECK32: L..C2: ; CHECK32-NEXT: .tc a[TC],a[TL]@ld ; CHECK32: L..C3: diff --git a/llvm/test/CodeGen/PowerPC/mergeable-string-pool.ll b/llvm/test/CodeGen/PowerPC/mergeable-string-pool.ll index 833ed9fa65acf..2a937806f4bbf 100644 --- a/llvm/test/CodeGen/PowerPC/mergeable-string-pool.ll +++ b/llvm/test/CodeGen/PowerPC/mergeable-string-pool.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 -enable-global-merge=false \ +; RUN: llc -verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff -mcpu=pwr8 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=AIX32,AIXDATA -; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 -enable-global-merge=false \ +; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr8 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=AIX64,AIXDATA ; RUN: llc -verify-machineinstrs -mtriple powerpc64-unknown-linux -mcpu=pwr8 \ ; RUN: -ppc-asm-full-reg-names < %s | FileCheck %s --check-prefixes=LINUX64BE,LINUXDATA @@ -40,9 +40,9 @@ define dso_local signext i32 @str1() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; 
AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) -; AIX32-NEXT: addi r3, r3, 422 +; AIX32-NEXT: addi r3, r3, 20 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -54,9 +54,9 @@ define dso_local signext i32 @str1() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) -; AIX64-NEXT: addi r3, r3, 422 +; AIX64-NEXT: addi r3, r3, 20 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -105,9 +105,9 @@ define dso_local signext i32 @str2() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) -; AIX32-NEXT: addi r3, r3, 388 +; AIX32-NEXT: addi r3, r3, 32 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -119,9 +119,9 @@ define dso_local signext i32 @str2() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) -; AIX64-NEXT: addi r3, r3, 388 +; AIX64-NEXT: addi r3, r3, 32 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -170,13 +170,13 @@ define dso_local signext i32 @str3() local_unnamed_addr #0 { ; AIX32-NEXT: stwu r1, -64(r1) ; AIX32-NEXT: stw r0, 72(r1) ; AIX32-NEXT: stw r30, 56(r1) # 4-byte Folded Spill -; AIX32-NEXT: lwz r30, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: addi r3, r30, 434 +; AIX32-NEXT: lwz r30, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: addi r3, r30, 44 ; AIX32-NEXT: stw 
r31, 60(r1) # 4-byte Folded Spill ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: mr r31, r3 -; AIX32-NEXT: addi r3, r30, 388 +; AIX32-NEXT: addi r3, r30, 32 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r3, r3, r31 @@ -193,13 +193,13 @@ define dso_local signext i32 @str3() local_unnamed_addr #0 { ; AIX64-NEXT: stdu r1, -128(r1) ; AIX64-NEXT: std r0, 144(r1) ; AIX64-NEXT: std r30, 112(r1) # 8-byte Folded Spill -; AIX64-NEXT: ld r30, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: addi r3, r30, 434 +; AIX64-NEXT: ld r30, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: addi r3, r30, 44 ; AIX64-NEXT: std r31, 120(r1) # 8-byte Folded Spill ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: mr r31, r3 -; AIX64-NEXT: addi r3, r30, 388 +; AIX64-NEXT: addi r3, r30, 32 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r3, r3, r31 @@ -272,9 +272,9 @@ define dso_local signext i32 @str4() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) -; AIX32-NEXT: addi r3, r3, 446 +; AIX32-NEXT: addi r3, r3, 56 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -286,9 +286,9 @@ define dso_local signext i32 @str4() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) -; AIX64-NEXT: addi r3, r3, 446 +; AIX64-NEXT: addi r3, r3, 56 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -335,9 +335,9 @@ define dso_local signext i32 @str5() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool +; AIX32-NEXT: lwz r3, 
L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) -; AIX32-NEXT: addi r3, r3, 493 +; AIX32-NEXT: addi r3, r3, 736 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -349,9 +349,9 @@ define dso_local signext i32 @str5() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) -; AIX64-NEXT: addi r3, r3, 493 +; AIX64-NEXT: addi r3, r3, 736 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -398,15 +398,15 @@ define dso_local signext i32 @array1() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -96(r1) -; AIX32-NEXT: lwz r5, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: li r6, 372 +; AIX32-NEXT: lwz r5, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: li r6, 308 ; AIX32-NEXT: li r4, 12 ; AIX32-NEXT: addi r3, r1, 64 ; AIX32-NEXT: stw r0, 104(r1) ; AIX32-NEXT: rlwimi r4, r3, 0, 30, 27 ; AIX32-NEXT: lxvw4x vs0, r5, r6 ; AIX32-NEXT: stxvw4x vs0, 0, r4 -; AIX32-NEXT: li r4, 360 +; AIX32-NEXT: li r4, 296 ; AIX32-NEXT: lxvw4x vs0, r5, r4 ; AIX32-NEXT: stxvw4x vs0, 0, r3 ; AIX32-NEXT: bl .calleeInt[PR] @@ -420,13 +420,13 @@ define dso_local signext i32 @array1() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -144(r1) -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: li r4, 372 +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: li r4, 308 ; AIX64-NEXT: std r0, 160(r1) ; AIX64-NEXT: lxvw4x vs0, r3, r4 ; AIX64-NEXT: addi r4, r1, 124 ; AIX64-NEXT: stxvw4x vs0, 0, r4 -; AIX64-NEXT: li r4, 360 +; AIX64-NEXT: li r4, 296 ; AIX64-NEXT: lxvw4x vs0, r3, r4 ; AIX64-NEXT: addi r3, r1, 112 ; AIX64-NEXT: stxvw4x vs0, 0, r3 @@ -666,8 +666,8 @@ define dso_local signext i32 @str7() local_unnamed_addr #0 { ; AIX32-NEXT: bl 
.callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: mr r31, r3 -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: addi r3, r3, 458 +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: addi r3, r3, 80 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r3, r3, r31 @@ -688,8 +688,8 @@ define dso_local signext i32 @str7() local_unnamed_addr #0 { ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: mr r31, r3 -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: addi r3, r3, 458 +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: addi r3, r3, 80 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r3, r3, r31 @@ -766,8 +766,8 @@ define dso_local signext i32 @mixed1() local_unnamed_addr #0 { ; AIX32-NEXT: bl .calleeInt[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: mr r31, r3 -; AIX32-NEXT: lwz r3, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: addi r3, r3, 400 +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: addi r3, r3, 68 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r3, r3, r31 @@ -787,8 +787,8 @@ define dso_local signext i32 @mixed1() local_unnamed_addr #0 { ; AIX64-NEXT: bl .calleeInt[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: mr r31, r3 -; AIX64-NEXT: ld r3, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: addi r3, r3, 400 +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: addi r3, r3, 68 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r3, r3, r31 @@ -860,15 +860,15 @@ define dso_local signext i32 @mixed2() local_unnamed_addr #0 { ; AIX32-NEXT: stwu r1, -112(r1) ; AIX32-NEXT: stw r0, 120(r1) ; AIX32-NEXT: stw r30, 104(r1) # 4-byte Folded Spill -; AIX32-NEXT: lwz r30, L..C0(r2) # @__ModuleStringPool -; AIX32-NEXT: li r5, 372 +; AIX32-NEXT: lwz r30, L..C0(r2) # @_MergedGlobals +; AIX32-NEXT: li r5, 308 ; AIX32-NEXT: li r4, 12 ; AIX32-NEXT: addi r3, r1, 64 ; AIX32-NEXT: stw r31, 108(r1) # 4-byte Folded Spill ; AIX32-NEXT: rlwimi r4, r3, 
0, 30, 27 ; AIX32-NEXT: lxvw4x vs0, r30, r5 ; AIX32-NEXT: stxvw4x vs0, 0, r4 -; AIX32-NEXT: li r4, 360 +; AIX32-NEXT: li r4, 296 ; AIX32-NEXT: lxvw4x vs0, r30, r4 ; AIX32-NEXT: stxvw4x vs0, 0, r3 ; AIX32-NEXT: bl .calleeInt[PR] @@ -878,11 +878,11 @@ define dso_local signext i32 @mixed2() local_unnamed_addr #0 { ; AIX32-NEXT: bl .calleeInt[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r31, r3, r31 -; AIX32-NEXT: addi r3, r30, 400 +; AIX32-NEXT: addi r3, r30, 68 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r31, r31, r3 -; AIX32-NEXT: addi r3, r30, 473 +; AIX32-NEXT: addi r3, r30, 273 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: add r3, r31, r3 @@ -899,13 +899,13 @@ define dso_local signext i32 @mixed2() local_unnamed_addr #0 { ; AIX64-NEXT: stdu r1, -160(r1) ; AIX64-NEXT: std r0, 176(r1) ; AIX64-NEXT: std r30, 144(r1) # 8-byte Folded Spill -; AIX64-NEXT: ld r30, L..C0(r2) # @__ModuleStringPool -; AIX64-NEXT: li r3, 372 +; AIX64-NEXT: ld r30, L..C0(r2) # @_MergedGlobals +; AIX64-NEXT: li r3, 308 ; AIX64-NEXT: std r31, 152(r1) # 8-byte Folded Spill ; AIX64-NEXT: lxvw4x vs0, r30, r3 ; AIX64-NEXT: addi r3, r1, 124 ; AIX64-NEXT: stxvw4x vs0, 0, r3 -; AIX64-NEXT: li r3, 360 +; AIX64-NEXT: li r3, 296 ; AIX64-NEXT: lxvw4x vs0, r30, r3 ; AIX64-NEXT: addi r3, r1, 112 ; AIX64-NEXT: stxvw4x vs0, 0, r3 @@ -916,11 +916,11 @@ define dso_local signext i32 @mixed2() local_unnamed_addr #0 { ; AIX64-NEXT: bl .calleeInt[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r31, r3, r31 -; AIX64-NEXT: addi r3, r30, 400 +; AIX64-NEXT: addi r3, r30, 68 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r31, r31, r3 -; AIX64-NEXT: addi r3, r30, 473 +; AIX64-NEXT: addi r3, r30, 273 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: add r3, r31, r3 @@ -1033,8 +1033,9 @@ define dso_local signext i32 @str9() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C4(r2) # @.str.9 +; 
AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) +; AIX32-NEXT: addi r3, r3, 128 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -1046,8 +1047,9 @@ define dso_local signext i32 @str9() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C4(r2) # @.str.9 +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) +; AIX64-NEXT: addi r3, r3, 128 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -1092,8 +1094,9 @@ define dso_local signext i32 @str10() local_unnamed_addr #0 { ; AIX32: # %bb.0: # %entry ; AIX32-NEXT: mflr r0 ; AIX32-NEXT: stwu r1, -64(r1) -; AIX32-NEXT: lwz r3, L..C5(r2) # @.str.10 +; AIX32-NEXT: lwz r3, L..C0(r2) # @_MergedGlobals ; AIX32-NEXT: stw r0, 72(r1) +; AIX32-NEXT: addi r3, r3, 256 ; AIX32-NEXT: bl .callee[PR] ; AIX32-NEXT: nop ; AIX32-NEXT: addi r1, r1, 64 @@ -1105,8 +1108,9 @@ define dso_local signext i32 @str10() local_unnamed_addr #0 { ; AIX64: # %bb.0: # %entry ; AIX64-NEXT: mflr r0 ; AIX64-NEXT: stdu r1, -112(r1) -; AIX64-NEXT: ld r3, L..C5(r2) # @.str.10 +; AIX64-NEXT: ld r3, L..C0(r2) # @_MergedGlobals ; AIX64-NEXT: std r0, 128(r1) +; AIX64-NEXT: addi r3, r3, 256 ; AIX64-NEXT: bl .callee[PR] ; AIX64-NEXT: nop ; AIX64-NEXT: addi r1, r1, 112 @@ -1148,8 +1152,18 @@ entry: attributes #0 = { nounwind } -; AIXDATA: .csect L..__ModuleStringPool[RO],3 -; AIXDATA: .align 3 # @__ModuleStringPool +; AIXDATA: .csect L.._MergedGlobals[RO],7 +; AIXDATA: .align 7 # @_MergedGlobals +; AIXDATA: .string "ABCABCABC" +; AIXDATA: .string "str1_STRING" +; AIXDATA: .string "str2_STRING" +; AIXDATA: .string "str3_STRING" +; AIXDATA: .string "str4_STRING" +; AIXDATA: .string "MixedString" +; AIXDATA: .byte 'S,'t,'a,'t,'i,'c,' ,'G,'l,'o,'b,'a,'l,0012,0000 +; AIXDATA: .string "str9_STRING....." +; AIXDATA: .string "str10_STRING...." 
+; AIXDATA: .string "Different String 01" ; AIXDATA: .vbyte 4, 5 # 0x5 ; AIXDATA: .vbyte 4, 7 # 0x7 ; AIXDATA: .vbyte 4, 9 # 0x9 @@ -1157,14 +1171,6 @@ attributes #0 = { nounwind } ; AIXDATA: .vbyte 4, 17 # 0x11 ; AIXDATA: .vbyte 4, 1235 # 0x4d3 ; AIXDATA: .vbyte 4, 32 # 0x20 -; AIXDATA: .string "str2_STRING" -; AIXDATA: .string "MixedString" -; AIXDATA: .string "ABCABCABC" -; AIXDATA: .string "str1_STRING" -; AIXDATA: .string "str3_STRING" -; AIXDATA: .string "str4_STRING" -; AIXDATA: .byte 'S,'t,'a,'t,'i,'c,' ,'G,'l,'o,'b,'a,'l,0012,0000 -; AIXDATA: .string "Different String 01" ; AIXDATA: .string "longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_longerstr5_STRING" ; LINUXDATA: .L__ModuleStringPool: diff --git a/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir b/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir index 71b1ad5368104..8e3c0862d3cba 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir +++ b/llvm/test/CodeGen/PowerPC/peephole-replaceInstr-after-eliminate-extsw.mir @@ -496,7 +496,8 @@ body: | %6:g8rc_and_g8rc_nox0 = EXTSB8 killed %84 %7:gprc = LHZ 6, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3)`, !tbaa !3) %86:gprc_and_gprc_nor0 = LHA 4, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 2)`) - ; CHECK: [[VIRREG1:%[0-9]+]]:gprc_and_gprc_nor0 = LHA 6, %64 :: (dereferenceable load (s16) from `ptr getelementptr inbounds ([8 x i16], ptr @shortArray, i64 0, i64 3)`, !tbaa !3) + ; CHECK: [[VIRREG162:%[0-9]+]]:g8rc = LHA8 6, %64 + ; CHECK-NEXT: [[VIRREG150:%[0-9]+]]:gprc_and_gprc_nor0 = COPY killed [[VIRREG162]].sub_32 %87:crrc = CMPW %80, %86 %88:gprc = ISEL %80, %86, %87.sub_lt %89:crrc = CMPLWI killed %88, 0 @@ -544,13 +545,19 @@ body: | 
%150:gprc_and_gprc_nor0 = EXTSH %7 %151:gprc_and_gprc_nor0 = EXTSH %0 - ; CHECK: [[VIRREG2:%[0-9]+]]:gprc_and_gprc_nor0 = EXTSH killed %0 + ; CHECK: [[VIRREG159:%[0-9]+]]:g8rc = IMPLICIT_DEF + ; CHECK-NEXT: [[VIRREG160:%[0-9]+]]:g8rc = INSERT_SUBREG [[VIRREG159]], %0, %subreg.sub_32 + ; CHECK-NEXT: [[VIRREG161:%[0-9]+]]:g8rc = EXTSH8 killed [[VIRREG160]] + ; CHECK-NEXT: %151:gprc_and_gprc_nor0 = COPY killed [[VIRREG161]].sub_32 + %152:crrc = CMPW %151, %150 %153:gprc = ISEL %151, %150, %152.sub_lt %154:g8rc = EXTSW_32_64 killed %153 - ; CHECK-NEXT: [[VIRREG3:%[0-9]+]]:crrc = CMPW [[VIRREG2]], [[VIRREG1]] - ; CHECK-NEXT: %153:gprc = ISEL killed [[VIRREG2]], killed [[VIRREG1]], killed [[VIRREG3]].sub_lt - ; CHECK-NOT: EXTSW_32_64 + ; CHECK: [[VIRREG165:%[0-9]+]]:g8rc = IMPLICIT_DEF + ; CHECK-NEXT: [[VIRREG166:%[0-9]+]]:g8rc = INSERT_SUBREG [[VIRREG165]], [[VIRREG150]], %subreg.sub_32 + ; CHECK-NEXT: [[VIRREG167:%[0-9]+]]:g8rc = ISEL8 killed %164, killed [[VIRREG166]], %152.sub_lt + ; CHECK-NEXT: %{{[0-9]+}}:gprc = COPY killed [[VIRREG167]].sub_32 + ; CHECK-NOT: EXTSW_32_64 %155:g8rc_and_g8rc_nox0 = LDtoc @computedResultUll, $x2 :: (load (s64) from got) STD %154, 0, killed %155 :: (store (s64) into @computedResultUll, !tbaa !7) ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 diff --git a/llvm/test/CodeGen/PowerPC/sext_elimination.mir b/llvm/test/CodeGen/PowerPC/sext_elimination.mir index e920848a4137c..bf6b9005fcf7f 100644 --- a/llvm/test/CodeGen/PowerPC/sext_elimination.mir +++ b/llvm/test/CodeGen/PowerPC/sext_elimination.mir @@ -41,8 +41,14 @@ body: | ; CHECK: %4:g8rc = EXTSW_32_64 killed %3 ; CHECK: %5:g8rc = INSERT_SUBREG %15, %1, %subreg.sub_32 ; CHECK: %7:g8rc = EXTSW_32_64 killed %6 - ; CHECK: %9:g8rc = INSERT_SUBREG %16, %8, %subreg.sub_32 - ; CHECK: %11:g8rc = INSERT_SUBREG %17, %10, %subreg.sub_32 + ; CHECK: %17:g8rc = INSERT_SUBREG %16, %1, %subreg.sub_32 + ; CHECK-NEXT: %18:g8rc = ORIS8 killed %17, 32767 + ; CHECK-NEXT: %8:gprc = COPY killed 
%18.sub_32 + ; CHECK: %9:g8rc = INSERT_SUBREG %19, %8, %subreg.sub_32 + ; CHECK: %21:g8rc = INSERT_SUBREG %20, %1, %subreg.sub_32 + ; CHECK-NEXT: %22:g8rc = ORI8 killed %21, 32768 + ; CHECK-NEXT: %10:gprc = COPY killed %22.sub_32 + ; CHECK: %11:g8rc = INSERT_SUBREG %23, %10, %subreg.sub_32 ; CHECK: %14:g8rc = COPY killed %13 %0:g8rc_nox0 = COPY $x3 diff --git a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll index 8748767501bd0..bc12276060a8b 100644 --- a/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll +++ b/llvm/test/CodeGen/PowerPC/stack-restore-with-setjmp.ll @@ -19,9 +19,10 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar ; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3 ; CHECK-NEXT: cmpwi 2, 3, 2 ; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: # kill: def $r4 killed $r4 killed $x4 +; CHECK-NEXT: mr 3, 4 ; CHECK-NEXT: std 0, 800(1) ; CHECK-NEXT: mr 31, 1 -; CHECK-NEXT: mr 3, 4 ; CHECK-NEXT: blt 2, .LBB0_3 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi 3, 31, 112 @@ -65,6 +66,7 @@ define dso_local signext i32 @main(i32 signext %argc, ptr nocapture readnone %ar ; BE-NEXT: stdu 1, -800(1) ; BE-NEXT: li 4, 0 ; BE-NEXT: # kill: def $r3 killed $r3 killed $x3 +; BE-NEXT: # kill: def $r4 killed $r4 killed $x4 ; BE-NEXT: cmpwi 2, 3, 2 ; BE-NEXT: mr 3, 4 ; BE-NEXT: std 0, 816(1) diff --git a/llvm/test/CodeGen/RISCV/pr95284.ll b/llvm/test/CodeGen/RISCV/pr95284.ll index 135e128c00bac..82600d8d3df51 100644 --- a/llvm/test/CodeGen/RISCV/pr95284.ll +++ b/llvm/test/CodeGen/RISCV/pr95284.ll @@ -6,19 +6,17 @@ define signext i64 @PR95284(i32 signext %0) { ; RV32I-LABEL: PR95284: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: neg a2, a1 -; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: srli a2, a2, 1 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: or a0, a1, a0 -; RV32I-NEXT: addi a0, a0, 1 -; RV32I-NEXT: seqz a1, a0 -; RV32I-NEXT: 
add a1, a2, a1 -; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: slli a2, a0, 31 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: addi a1, a1, 1 +; RV32I-NEXT: seqz a2, a1 +; RV32I-NEXT: sub a2, a2, a0 +; RV32I-NEXT: andi a0, a1, -2 +; RV32I-NEXT: slli a1, a2, 1 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: andi a0, a0, -2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: PR95284: diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll index 5ced89c17c420..da477aa2043cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll +++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+m,+zfh,+zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+m,+zvfh \ ; RUN: < %s | FileCheck %s declare <16 x i8> @llvm.vector.extract.v16i8.nxv8i8(, i64 immarg) diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index 15cff650765ef..f9588ffb5da52 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc 
-mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/cmp-folds.ll b/llvm/test/CodeGen/RISCV/rvv/cmp-folds.ll index 4c40b7c74451d..7a995a8d29f9e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cmp-folds.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cmp-folds.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s define @not_icmp_sle_nxv8i16( %a, %b) { ; CHECK-LABEL: not_icmp_sle_nxv8i16: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index 9ea1394a1dd2c..2c9f633b89014 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 ; RUN: llc 
-mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index d36240e493e41..ed86755d5f48a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll new file mode 100644 index 0000000000000..65df96bef2ad1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll @@ -0,0 +1,20170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 +; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64 + +; Load + expand for i8 type + +define <1 x i8> @test_expandload_v1i8(ptr %base, <1 x i1> %mask, <1 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v9, 
(a0) +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <1 x i8> @llvm.masked.expandload.v1i8(ptr align 1 %base, <1 x i1> %mask, <1 x i8> %passthru) + ret <1 x i8> %res +} + +define <1 x i8> @test_expandload_v1i8_all_ones(ptr %base, <1 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v1i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <1 x i8> @llvm.masked.expandload.v1i8(ptr align 1 %base, <1 x i1> splat (i1 true), <1 x i8> %passthru) + ret <1 x i8> %res +} + +define <2 x i8> @test_expandload_v2i8(ptr %base, <2 x i1> %mask, <2 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr align 1 %base, <2 x i1> %mask, <2 x i8> %passthru) + ret <2 x i8> %res +} + +define <2 x i8> @test_expandload_v2i8_all_ones(ptr %base, <2 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v2i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr align 1 %base, <2 x i1> splat (i1 true), <2 x i8> %passthru) + ret <2 x i8> %res +} + +define <4 x i8> @test_expandload_v4i8(ptr %base, <4 x i1> %mask, <4 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; 
CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.masked.expandload.v4i8(ptr align 1 %base, <4 x i1> %mask, <4 x i8> %passthru) + ret <4 x i8> %res +} + +define <4 x i8> @test_expandload_v4i8_all_ones(ptr %base, <4 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v4i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x i8> @llvm.masked.expandload.v4i8(ptr align 1 %base, <4 x i1> splat (i1 true), <4 x i8> %passthru) + ret <4 x i8> %res +} + +define <8 x i8> @test_expandload_v8i8(ptr %base, <8 x i1> %mask, <8 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.masked.expandload.v8i8(ptr align 1 %base, <8 x i1> %mask, <8 x i8> %passthru) + ret <8 x i8> %res +} + +define <8 x i8> @test_expandload_v8i8_all_ones(ptr %base, <8 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v8i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.masked.expandload.v8i8(ptr align 1 %base, <8 x i1> splat (i1 true), <8 x i8> %passthru) + ret <8 x i8> %res +} + +define <16 x i8> @test_expandload_v16i8(ptr %base, <16 x i1> %mask, <16 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, 
v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.masked.expandload.v16i8(ptr align 1 %base, <16 x i1> %mask, <16 x i8> %passthru) + ret <16 x i8> %res +} + +define <16 x i8> @test_expandload_v16i8_all_ones(ptr %base, <16 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v16i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.masked.expandload.v16i8(ptr align 1 %base, <16 x i1> splat (i1 true), <16 x i8> %passthru) + ret <16 x i8> %res +} + +define <32 x i8> @test_expandload_v32i8(ptr %base, <32 x i1> %mask, <32 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret + %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> %mask, <32 x i8> %passthru) + ret <32 x i8> %res +} + +define <32 x i8> @test_expandload_v32i8_all_ones(ptr %base, <32 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v32i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> splat (i1 true), <32 x i8> %passthru) + ret <32 x i8> %res +} + +define <64 x i8> @test_expandload_v64i8(ptr %base, <64 x i1> %mask, <64 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: 
viota.m v16, v0 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> %mask, <64 x i8> %passthru) + ret <64 x i8> %res +} + +define <64 x i8> @test_expandload_v64i8_all_ones(ptr %base, <64 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v64i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> splat (i1 true), <64 x i8> %passthru) + ret <64 x i8> %res +} + +define <128 x i8> @test_expandload_v128i8(ptr %base, <128 x i1> %mask, <128 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: viota.m v24, v0 +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: ret + %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> %mask, <128 x i8> %passthru) + ret <128 x i8> %res +} + +define <128 x i8> @test_expandload_v128i8_all_ones(ptr %base, <128 x i8> %passthru) { +; CHECK-LABEL: test_expandload_v128i8_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> splat (i1 true), <128 x i8> %passthru) + ret <128 x i8> %res +} + +define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v256i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: slli 
a2, a2, 5 +; CHECK-RV32-NEXT: sub sp, sp, a2 +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: li a3, 24 +; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v7, v8 +; CHECK-RV32-NEXT: li a2, 128 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v8, (a1) +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 3 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV32-NEXT: li a1, 32 +; CHECK-RV32-NEXT: vsrl.vx v10, v9, a1 +; CHECK-RV32-NEXT: vmv.x.s a3, v10 +; CHECK-RV32-NEXT: vsrl.vx v10, v0, a1 +; CHECK-RV32-NEXT: vmv.x.s a1, v10 +; CHECK-RV32-NEXT: vmv.x.s a4, v9 +; CHECK-RV32-NEXT: vmv.x.s a5, v0 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV32-NEXT: vcpop.m a6, v0 +; CHECK-RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v8, (a0) +; CHECK-RV32-NEXT: csrr a6, vlenb +; CHECK-RV32-NEXT: slli a6, a6, 4 +; CHECK-RV32-NEXT: add a6, sp, a6 +; CHECK-RV32-NEXT: addi a6, a6, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: cpop a1, a1 +; CHECK-RV32-NEXT: cpop a5, a5 +; CHECK-RV32-NEXT: add a1, a5, a1 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: add a1, a1, a3 +; CHECK-RV32-NEXT: add a0, a0, a1 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV32-NEXT: vcpop.m a1, v7 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v8, (a0) +; CHECK-RV32-NEXT: addi a0, sp, 16 
+; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v16, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; 
CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: add sp, sp, a0 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v256i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: slli a2, a2, 5 +; CHECK-RV64-NEXT: sub sp, sp, a2 +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v7, v8 +; CHECK-RV64-NEXT: li a2, 128 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: vle8.v v8, (a1) +; CHECK-RV64-NEXT: addi a1, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v9, v0, 1 +; CHECK-RV64-NEXT: vmv.x.s a1, v9 +; CHECK-RV64-NEXT: vmv.x.s a3, v0 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a4, v0 +; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-RV64-NEXT: vle8.v v24, (a0) +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 4 +; CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a4, v7 +; CHECK-RV64-NEXT: cpop a3, a3 +; CHECK-RV64-NEXT: cpop a1, a1 +; CHECK-RV64-NEXT: add a0, a0, a3 +; CHECK-RV64-NEXT: add a0, a0, a1 +; CHECK-RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-RV64-NEXT: vle8.v v8, (a0) +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; 
CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v24, v16, v0.t +; CHECK-RV64-NEXT: vmv.v.v v16, v8 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, 
(a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: add sp, sp, a0 +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret + %res = call <256 x i8> @llvm.masked.expandload.v256i8(ptr align 1 %base, <256 x i1> %mask, <256 x i8> %passthru) + ret <256 x i8> %res +} + +define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v256i8_all_ones: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a1, 128 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV32-NEXT: vmset.m v8 +; CHECK-RV32-NEXT: li a2, 32 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v9, v8, a2 +; CHECK-RV32-NEXT: vmv.x.s a3, v9 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: vmv.x.s a4, v8 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-RV32-NEXT: vsrl.vx v9, v8, a2 +; CHECK-RV32-NEXT: vmv.x.s a2, v9 +; CHECK-RV32-NEXT: cpop a2, a2 +; CHECK-RV32-NEXT: vmv.x.s a4, v8 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a2, a4, a2 +; CHECK-RV32-NEXT: add a3, a0, a3 +; CHECK-RV32-NEXT: add a2, a3, a2 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV32-NEXT: vle8.v v16, (a2) +; CHECK-RV32-NEXT: vle8.v v8, (a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v256i8_all_ones: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a1, 128 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV64-NEXT: vle8.v v8, (a0) +; CHECK-RV64-NEXT: vmset.m v16 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v16 +; CHECK-RV64-NEXT: cpop a2, a2 +; CHECK-RV64-NEXT: vslidedown.vi v16, v16, 1 +; CHECK-RV64-NEXT: vmv.x.s a3, v16 +; CHECK-RV64-NEXT: cpop a3, a3 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: add a0, a0, a3 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV64-NEXT: 
vle8.v v16, (a0) +; CHECK-RV64-NEXT: ret + %res = call <256 x i8> @llvm.masked.expandload.v256i8(ptr align 1 %base, <256 x i1> splat (i1 true), <256 x i8> %passthru) + ret <256 x i8> %res +} + +; Load + expand for i16 type + +define <1 x i16> @test_expandload_v1i16(ptr %base, <1 x i1> %mask, <1 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <1 x i16> @llvm.masked.expandload.v1i16(ptr align 2 %base, <1 x i1> %mask, <1 x i16> %passthru) + ret <1 x i16> %res +} + +define <1 x i16> @test_expandload_v1i16_all_ones(ptr %base, <1 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v1i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <1 x i16> @llvm.masked.expandload.v1i16(ptr align 2 %base, <1 x i1> splat (i1 true), <1 x i16> %passthru) + ret <1 x i16> %res +} + +define <2 x i16> @test_expandload_v2i16(ptr %base, <2 x i1> %mask, <2 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.masked.expandload.v2i16(ptr align 2 %base, <2 x i1> %mask, <2 x i16> %passthru) + ret <2 x i16> %res +} + +define <2 x i16> @test_expandload_v2i16_all_ones(ptr %base, <2 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v2i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, 
ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <2 x i16> @llvm.masked.expandload.v2i16(ptr align 2 %base, <2 x i1> splat (i1 true), <2 x i16> %passthru) + ret <2 x i16> %res +} + +define <4 x i16> @test_expandload_v4i16(ptr %base, <4 x i1> %mask, <4 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.masked.expandload.v4i16(ptr align 2 %base, <4 x i1> %mask, <4 x i16> %passthru) + ret <4 x i16> %res +} + +define <4 x i16> @test_expandload_v4i16_all_ones(ptr %base, <4 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v4i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.masked.expandload.v4i16(ptr align 2 %base, <4 x i1> splat (i1 true), <4 x i16> %passthru) + ret <4 x i16> %res +} + +define <8 x i16> @test_expandload_v8i16(ptr %base, <8 x i1> %mask, <8 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.masked.expandload.v8i16(ptr align 2 %base, <8 x i1> %mask, <8 x i16> %passthru) + ret <8 x i16> %res +} + +define <8 x i16> @test_expandload_v8i16_all_ones(ptr %base, <8 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v8i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: 
vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.masked.expandload.v8i16(ptr align 2 %base, <8 x i1> splat (i1 true), <8 x i16> %passthru) + ret <8 x i16> %res +} + +define <16 x i16> @test_expandload_v16i16(ptr %base, <16 x i1> %mask, <16 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.masked.expandload.v16i16(ptr align 2 %base, <16 x i1> %mask, <16 x i16> %passthru) + ret <16 x i16> %res +} + +define <16 x i16> @test_expandload_v16i16_all_ones(ptr %base, <16 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v16i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.masked.expandload.v16i16(ptr align 2 %base, <16 x i1> splat (i1 true), <16 x i16> %passthru) + ret <16 x i16> %res +} + +define <32 x i16> @test_expandload_v32i16(ptr %base, <32 x i1> %mask, <32 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %res = call <32 x i16> @llvm.masked.expandload.v32i16(ptr align 2 %base, <32 x i1> %mask, <32 x i16> %passthru) + ret <32 x i16> %res +} + +define <32 x i16> @test_expandload_v32i16_all_ones(ptr %base, <32 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v32i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; 
CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <32 x i16> @llvm.masked.expandload.v32i16(ptr align 2 %base, <32 x i1> splat (i1 true), <32 x i16> %passthru) + ret <32 x i16> %res +} + +define <64 x i16> @test_expandload_v64i16(ptr %base, <64 x i1> %mask, <64 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: viota.m v24, v0 +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: ret + %res = call <64 x i16> @llvm.masked.expandload.v64i16(ptr align 2 %base, <64 x i1> %mask, <64 x i16> %passthru) + ret <64 x i16> %res +} + +define <64 x i16> @test_expandload_v64i16_all_ones(ptr %base, <64 x i16> %passthru) { +; CHECK-LABEL: test_expandload_v64i16_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <64 x i16> @llvm.masked.expandload.v64i16(ptr align 2 %base, <64 x i1> splat (i1 true), <64 x i16> %passthru) + ret <64 x i16> %res +} + +define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x i16> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v128i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: li a2, 40 +; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: sub sp, sp, a1 +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 
16 +; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v0 +; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v8, (a0) +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: slli a2, a2, 4 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v7 +; CHECK-RV32-NEXT: li a3, 32 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v25, v0, a3 +; CHECK-RV32-NEXT: vmv.x.s a3, v25 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: vmv.x.s a4, v0 +; CHECK-RV32-NEXT: cpop a4, a4 +; CHECK-RV32-NEXT: add a3, a4, a3 +; CHECK-RV32-NEXT: slli a3, a3, 1 +; CHECK-RV32-NEXT: add a0, a0, a3 +; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v8, (a0) +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v8, v0 +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 5 
+; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v16, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV32-NEXT: vmv.v.v v16, v8 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 40 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add sp, sp, a0 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v128i16: +; CHECK-RV64: # %bb.0: +; 
CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: li a2, 40 +; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: sub sp, sp, a1 +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: add a1, sp, a1 +; CHECK-RV64-NEXT: addi a1, a1, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: li a1, 64 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vcpop.m a2, v0 +; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v16, (a0) +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vcpop.m a2, v7 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a3, v0 +; CHECK-RV64-NEXT: cpop a3, a3 +; CHECK-RV64-NEXT: slli a3, a3, 1 +; CHECK-RV64-NEXT: add a0, a0, a3 +; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v16, (a0) +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add 
a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 40 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add sp, sp, a0 +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret + %res = call <128 x i16> @llvm.masked.expandload.v128i16(ptr align 2 %base, <128 x i1> %mask, <128 x i16> %passthru) + 
ret <128 x i16> %res +} + +define <128 x i16> @test_expandload_v128i16_all_ones(ptr %base, <128 x i16> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v128i16_all_ones: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a1, 64 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v8, (a0) +; CHECK-RV32-NEXT: vmset.m v16 +; CHECK-RV32-NEXT: li a2, 32 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v17, v16, a2 +; CHECK-RV32-NEXT: vmv.x.s a2, v17 +; CHECK-RV32-NEXT: cpop a2, a2 +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: add a2, a3, a2 +; CHECK-RV32-NEXT: slli a2, a2, 1 +; CHECK-RV32-NEXT: add a0, a0, a2 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV32-NEXT: vle16.v v16, (a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v128i16_all_ones: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a1, 64 +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v8, (a0) +; CHECK-RV64-NEXT: vmset.m v16 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v16 +; CHECK-RV64-NEXT: cpop a2, a2 +; CHECK-RV64-NEXT: slli a2, a2, 1 +; CHECK-RV64-NEXT: add a0, a0, a2 +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v16, (a0) +; CHECK-RV64-NEXT: ret + %res = call <128 x i16> @llvm.masked.expandload.v128i16(ptr align 2 %base, <128 x i1> splat (i1 true), <128 x i16> %passthru) + ret <128 x i16> %res +} + +; Load + expand for i32 type + +define <1 x i32> @test_expandload_v1i32(ptr %base, <1 x i1> %mask, <1 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; 
CHECK-NEXT: ret + %res = call <1 x i32> @llvm.masked.expandload.v1i32(ptr align 4 %base, <1 x i1> %mask, <1 x i32> %passthru) + ret <1 x i32> %res +} + +define <1 x i32> @test_expandload_v1i32_all_ones(ptr %base, <1 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v1i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <1 x i32> @llvm.masked.expandload.v1i32(ptr align 4 %base, <1 x i1> splat (i1 true), <1 x i32> %passthru) + ret <1 x i32> %res +} + +define <2 x i32> @test_expandload_v2i32(ptr %base, <2 x i1> %mask, <2 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.masked.expandload.v2i32(ptr align 4 %base, <2 x i1> %mask, <2 x i32> %passthru) + ret <2 x i32> %res +} + +define <2 x i32> @test_expandload_v2i32_all_ones(ptr %base, <2 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v2i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.masked.expandload.v2i32(ptr align 4 %base, <2 x i1> splat (i1 true), <2 x i32> %passthru) + ret <2 x i32> %res +} + +define <4 x i32> @test_expandload_v4i32(ptr %base, <4 x i1> %mask, <4 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res 
= call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 %base, <4 x i1> %mask, <4 x i32> %passthru) + ret <4 x i32> %res +} + +define <4 x i32> @test_expandload_v4i32_all_ones(ptr %base, <4 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v4i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 %base, <4 x i1> splat (i1 true), <4 x i32> %passthru) + ret <4 x i32> %res +} + +define <8 x i32> @test_expandload_v8i32(ptr %base, <8 x i1> %mask, <8 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.masked.expandload.v8i32(ptr align 4 %base, <8 x i1> %mask, <8 x i32> %passthru) + ret <8 x i32> %res +} + +define <8 x i32> @test_expandload_v8i32_all_ones(ptr %base, <8 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v8i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.masked.expandload.v8i32(ptr align 4 %base, <8 x i1> splat (i1 true), <8 x i32> %passthru) + ret <8 x i32> %res +} + +define <16 x i32> @test_expandload_v16i32(ptr %base, <16 x i1> %mask, <16 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %res = call <16 x i32> 
@llvm.masked.expandload.v16i32(ptr align 4 %base, <16 x i1> %mask, <16 x i32> %passthru) + ret <16 x i32> %res +} + +define <16 x i32> @test_expandload_v16i32_all_ones(ptr %base, <16 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v16i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <16 x i32> @llvm.masked.expandload.v16i32(ptr align 4 %base, <16 x i1> splat (i1 true), <16 x i32> %passthru) + ret <16 x i32> %res +} + +define <32 x i32> @test_expandload_v32i32(ptr %base, <32 x i1> %mask, <32 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: viota.m v24, v0 +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: ret + %res = call <32 x i32> @llvm.masked.expandload.v32i32(ptr align 4 %base, <32 x i1> %mask, <32 x i32> %passthru) + ret <32 x i32> %res +} + +define <32 x i32> @test_expandload_v32i32_all_ones(ptr %base, <32 x i32> %passthru) { +; CHECK-LABEL: test_expandload_v32i32_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <32 x i32> @llvm.masked.expandload.v32i32(ptr align 4 %base, <32 x i1> splat (i1 true), <32 x i32> %passthru) + ret <32 x i32> %res +} + +define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v64i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: li a2, 40 +; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: sub sp, sp, a1 +; CHECK-RV32-NEXT: .cfi_escape 0x0f, 
0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: li a1, 32 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v0 +; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV32-NEXT: vle32.v v16, (a0) +; CHECK-RV32-NEXT: csrr a2, vlenb +; CHECK-RV32-NEXT: li a3, 24 +; CHECK-RV32-NEXT: mul a2, a2, a3 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: addi a2, a2, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4 +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v7 +; CHECK-RV32-NEXT: vmv.x.s a3, v0 +; CHECK-RV32-NEXT: cpop a3, a3 +; CHECK-RV32-NEXT: slli a3, a3, 2 +; CHECK-RV32-NEXT: add a0, a0, a3 +; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV32-NEXT: vle32.v v16, (a0) +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; 
CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v16, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 40 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add sp, sp, a0 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v64i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: li a2, 40 +; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: sub sp, sp, a1 +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 
* vlenb +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: add a1, sp, a1 +; CHECK-RV64-NEXT: addi a1, a1, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: li a1, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV64-NEXT: vcpop.m a2, v0 +; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v16, (a0) +; CHECK-RV64-NEXT: csrr a2, vlenb +; CHECK-RV64-NEXT: li a3, 24 +; CHECK-RV64-NEXT: mul a2, a2, a3 +; CHECK-RV64-NEXT: add a2, sp, a2 +; CHECK-RV64-NEXT: addi a2, a2, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 4 +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a2, v7 +; CHECK-RV64-NEXT: vmv.x.s a3, v0 +; CHECK-RV64-NEXT: cpopw a3, a3 +; CHECK-RV64-NEXT: slli a3, a3, 2 +; CHECK-RV64-NEXT: add a0, a0, a3 +; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v16, (a0) +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, 
a0, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 40 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add sp, sp, a0 +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret + %res = call <64 x i32> @llvm.masked.expandload.v64i32(ptr align 4 %base, <64 x i1> %mask, <64 x i32> %passthru) + ret <64 x i32> %res +} + +define <64 x i32> @test_expandload_v64i32_all_ones(ptr %base, <64 x i32> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v64i32_all_ones: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a1, 32 +; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV32-NEXT: vle32.v v8, (a0) +; CHECK-RV32-NEXT: vmset.m v16 +; CHECK-RV32-NEXT: vmv.x.s a1, v16 +; CHECK-RV32-NEXT: cpop 
a1, a1 +; CHECK-RV32-NEXT: slli a1, a1, 2 +; CHECK-RV32-NEXT: add a0, a0, a1 +; CHECK-RV32-NEXT: vle32.v v16, (a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v64i32_all_ones: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: li a1, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-RV64-NEXT: vle32.v v8, (a0) +; CHECK-RV64-NEXT: addi a0, a0, 128 +; CHECK-RV64-NEXT: vle32.v v16, (a0) +; CHECK-RV64-NEXT: ret + %res = call <64 x i32> @llvm.masked.expandload.v64i32(ptr align 4 %base, <64 x i1> splat (i1 true), <64 x i32> %passthru) + ret <64 x i32> %res +} + +; Load + expand for i64 type + +define <1 x i64> @test_expandload_v1i64(ptr %base, <1 x i1> %mask, <1 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.masked.expandload.v1i64(ptr align 8 %base, <1 x i1> %mask, <1 x i64> %passthru) + ret <1 x i64> %res +} + +define <1 x i64> @test_expandload_v1i64_all_ones(ptr %base, <1 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v1i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.masked.expandload.v1i64(ptr align 8 %base, <1 x i1> splat (i1 true), <1 x i64> %passthru) + ret <1 x i64> %res +} + +define <2 x i64> @test_expandload_v2i64(ptr %base, <2 x i1> %mask, <2 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 
+; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 %base, <2 x i1> %mask, <2 x i64> %passthru) + ret <2 x i64> %res +} + +define <2 x i64> @test_expandload_v2i64_all_ones(ptr %base, <2 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v2i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 %base, <2 x i1> splat (i1 true), <2 x i64> %passthru) + ret <2 x i64> %res +} + +define <4 x i64> @test_expandload_v4i64(ptr %base, <4 x i1> %mask, <4 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret + %res = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 %base, <4 x i1> %mask, <4 x i64> %passthru) + ret <4 x i64> %res +} + +define <4 x i64> @test_expandload_v4i64_all_ones(ptr %base, <4 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v4i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 %base, <4 x i1> splat (i1 true), <4 x i64> %passthru) + ret <4 x i64> %res +} + +define <8 x i64> @test_expandload_v8i64(ptr %base, <8 x i1> %mask, <8 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: 
vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %res = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 %base, <8 x i1> %mask, <8 x i64> %passthru) + ret <8 x i64> %res +} + +define <8 x i64> @test_expandload_v8i64_all_ones(ptr %base, <8 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v8i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 %base, <8 x i1> splat (i1 true), <8 x i64> %passthru) + ret <8 x i64> %res +} + +define <16 x i64> @test_expandload_v16i64(ptr %base, <16 x i1> %mask, <16 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-NEXT: viota.m v24, v0 +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: ret + %res = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 %base, <16 x i1> %mask, <16 x i64> %passthru) + ret <16 x i64> %res +} + +define <16 x i64> @test_expandload_v16i64_all_ones(ptr %base, <16 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v16i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: ret + %res = call <16 x i64> @llvm.masked.expandload.v16i64(ptr align 8 %base, <16 x i1> splat (i1 true), <16 x i64> %passthru) + ret <16 x i64> %res +} + +define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64> %passthru) { +; CHECK-RV32-LABEL: test_expandload_v32i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: li a2, 40 +; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: sub sp, sp, a1 +; CHECK-RV32-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: slli a1, a1, 5 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32-NEXT: vcpop.m a1, v0 +; CHECK-RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV32-NEXT: vle64.v v16, (a0) +; CHECK-RV32-NEXT: csrr a1, vlenb +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a1, a1, a2 +; CHECK-RV32-NEXT: add a1, sp, a1 +; CHECK-RV32-NEXT: addi a1, a1, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv.x.s a1, v0 +; CHECK-RV32-NEXT: zext.h a1, a1 +; CHECK-RV32-NEXT: cpop a1, a1 +; CHECK-RV32-NEXT: slli a1, a1, 3 +; CHECK-RV32-NEXT: add a0, a0, a1 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32-NEXT: vcpop.m a1, v7 +; CHECK-RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-RV32-NEXT: vle64.v v16, (a0) +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV32-NEXT: viota.m v16, v0 +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: addi a0, sp, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t +; 
CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: viota.m v16, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 5 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: li a1, 40 +; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: add sp, sp, a0 +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v32i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: li a2, 40 +; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: sub sp, sp, a1 +; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 
0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: slli a1, a1, 5 +; CHECK-RV64-NEXT: add a1, sp, a1 +; CHECK-RV64-NEXT: addi a1, a1, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64-NEXT: vcpop.m a1, v0 +; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-RV64-NEXT: vle64.v v16, (a0) +; CHECK-RV64-NEXT: csrr a1, vlenb +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a1, a1, a2 +; CHECK-RV64-NEXT: add a1, sp, a1 +; CHECK-RV64-NEXT: addi a1, a1, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv.x.s a1, v0 +; CHECK-RV64-NEXT: zext.h a1, a1 +; CHECK-RV64-NEXT: cpopw a1, a1 +; CHECK-RV64-NEXT: slli a1, a1, 3 +; CHECK-RV64-NEXT: add a0, a0, a1 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 2 +; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64-NEXT: vcpop.m a1, v7 +; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-RV64-NEXT: vle64.v v16, (a0) +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-RV64-NEXT: viota.m v16, v0 +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: addi a0, sp, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: 
add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: viota.m v16, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 5 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: li a1, 40 +; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: add sp, sp, a0 +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret + %res = call <32 x i64> @llvm.masked.expandload.v32i64(ptr align 8 %base, <32 x i1> %mask, <32 x i64> %passthru) + ret <32 x i64> %res +} + +define <32 x i64> @test_expandload_v32i64_all_ones(ptr %base, <32 x i64> %passthru) { +; CHECK-LABEL: test_expandload_v32i64_all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: ret + %res = call 
<32 x i64> @llvm.masked.expandload.v32i64(ptr align 8 %base, <32 x i1> splat (i1 true), <32 x i64> %passthru) + ret <32 x i64> %res +} + +; Tests that will exceed the range of i8 index. + +define <512 x i8> @test_expandload_v512i8(ptr %base, <512 x i1> %mask, <512 x i8> %passthru) vscale_range(16, 1024) { +; CHECK-LABEL: test_expandload_v512i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 512 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vcpop.m a2, v0 +; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, mu +; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret + %res = call <512 x i8> @llvm.masked.expandload.v512i8(ptr align 1 %base, <512 x i1> %mask, <512 x i8> %passthru) + ret <512 x i8> %res +} + +; FIXME: We can split it in lowering. +define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <512 x i8> %passthru) vscale_range(8, 1024) { +; CHECK-RV32-LABEL: test_expandload_v512i8_vlen512: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v0 +; CHECK-RV32-NEXT: andi a1, a3, 1 +; CHECK-RV32-NEXT: beqz a1, .LBB61_1 +; CHECK-RV32-NEXT: j .LBB61_544 +; CHECK-RV32-NEXT: .LBB61_1: # %else +; CHECK-RV32-NEXT: andi a1, a3, 2 +; CHECK-RV32-NEXT: beqz a1, .LBB61_2 +; CHECK-RV32-NEXT: j .LBB61_545 +; CHECK-RV32-NEXT: .LBB61_2: # %else2 +; CHECK-RV32-NEXT: andi a1, a3, 4 +; CHECK-RV32-NEXT: beqz a1, .LBB61_3 +; CHECK-RV32-NEXT: j .LBB61_546 +; CHECK-RV32-NEXT: .LBB61_3: # %else6 +; CHECK-RV32-NEXT: andi a1, a3, 8 +; CHECK-RV32-NEXT: beqz a1, .LBB61_4 +; CHECK-RV32-NEXT: j .LBB61_547 +; CHECK-RV32-NEXT: .LBB61_4: # %else10 +; CHECK-RV32-NEXT: andi a1, a3, 16 +; CHECK-RV32-NEXT: beqz a1, .LBB61_5 +; CHECK-RV32-NEXT: j .LBB61_548 +; CHECK-RV32-NEXT: .LBB61_5: # %else14 +; CHECK-RV32-NEXT: andi a1, a3, 32 +; 
CHECK-RV32-NEXT: beqz a1, .LBB61_6 +; CHECK-RV32-NEXT: j .LBB61_549 +; CHECK-RV32-NEXT: .LBB61_6: # %else18 +; CHECK-RV32-NEXT: andi a1, a3, 64 +; CHECK-RV32-NEXT: beqz a1, .LBB61_7 +; CHECK-RV32-NEXT: j .LBB61_550 +; CHECK-RV32-NEXT: .LBB61_7: # %else22 +; CHECK-RV32-NEXT: andi a1, a3, 128 +; CHECK-RV32-NEXT: beqz a1, .LBB61_8 +; CHECK-RV32-NEXT: j .LBB61_551 +; CHECK-RV32-NEXT: .LBB61_8: # %else26 +; CHECK-RV32-NEXT: andi a1, a3, 256 +; CHECK-RV32-NEXT: beqz a1, .LBB61_9 +; CHECK-RV32-NEXT: j .LBB61_552 +; CHECK-RV32-NEXT: .LBB61_9: # %else30 +; CHECK-RV32-NEXT: andi a1, a3, 512 +; CHECK-RV32-NEXT: beqz a1, .LBB61_10 +; CHECK-RV32-NEXT: j .LBB61_553 +; CHECK-RV32-NEXT: .LBB61_10: # %else34 +; CHECK-RV32-NEXT: andi a1, a3, 1024 +; CHECK-RV32-NEXT: beqz a1, .LBB61_11 +; CHECK-RV32-NEXT: j .LBB61_554 +; CHECK-RV32-NEXT: .LBB61_11: # %else38 +; CHECK-RV32-NEXT: slli a1, a3, 20 +; CHECK-RV32-NEXT: bgez a1, .LBB61_12 +; CHECK-RV32-NEXT: j .LBB61_555 +; CHECK-RV32-NEXT: .LBB61_12: # %else42 +; CHECK-RV32-NEXT: slli a1, a3, 19 +; CHECK-RV32-NEXT: bgez a1, .LBB61_13 +; CHECK-RV32-NEXT: j .LBB61_556 +; CHECK-RV32-NEXT: .LBB61_13: # %else46 +; CHECK-RV32-NEXT: slli a1, a3, 18 +; CHECK-RV32-NEXT: bgez a1, .LBB61_14 +; CHECK-RV32-NEXT: j .LBB61_557 +; CHECK-RV32-NEXT: .LBB61_14: # %else50 +; CHECK-RV32-NEXT: slli a1, a3, 17 +; CHECK-RV32-NEXT: bgez a1, .LBB61_15 +; CHECK-RV32-NEXT: j .LBB61_558 +; CHECK-RV32-NEXT: .LBB61_15: # %else54 +; CHECK-RV32-NEXT: slli a1, a3, 16 +; CHECK-RV32-NEXT: bgez a1, .LBB61_16 +; CHECK-RV32-NEXT: j .LBB61_559 +; CHECK-RV32-NEXT: .LBB61_16: # %else58 +; CHECK-RV32-NEXT: slli a1, a3, 15 +; CHECK-RV32-NEXT: bgez a1, .LBB61_17 +; CHECK-RV32-NEXT: j .LBB61_560 +; CHECK-RV32-NEXT: .LBB61_17: # %else62 +; CHECK-RV32-NEXT: slli a1, a3, 14 +; CHECK-RV32-NEXT: bgez a1, .LBB61_18 +; CHECK-RV32-NEXT: j .LBB61_561 +; CHECK-RV32-NEXT: .LBB61_18: # %else66 +; CHECK-RV32-NEXT: slli a1, a3, 13 +; CHECK-RV32-NEXT: bgez a1, .LBB61_19 +; CHECK-RV32-NEXT: j 
.LBB61_562 +; CHECK-RV32-NEXT: .LBB61_19: # %else70 +; CHECK-RV32-NEXT: slli a1, a3, 12 +; CHECK-RV32-NEXT: bgez a1, .LBB61_20 +; CHECK-RV32-NEXT: j .LBB61_563 +; CHECK-RV32-NEXT: .LBB61_20: # %else74 +; CHECK-RV32-NEXT: slli a1, a3, 11 +; CHECK-RV32-NEXT: bgez a1, .LBB61_21 +; CHECK-RV32-NEXT: j .LBB61_564 +; CHECK-RV32-NEXT: .LBB61_21: # %else78 +; CHECK-RV32-NEXT: slli a1, a3, 10 +; CHECK-RV32-NEXT: bgez a1, .LBB61_22 +; CHECK-RV32-NEXT: j .LBB61_565 +; CHECK-RV32-NEXT: .LBB61_22: # %else82 +; CHECK-RV32-NEXT: slli a1, a3, 9 +; CHECK-RV32-NEXT: bgez a1, .LBB61_23 +; CHECK-RV32-NEXT: j .LBB61_566 +; CHECK-RV32-NEXT: .LBB61_23: # %else86 +; CHECK-RV32-NEXT: slli a1, a3, 8 +; CHECK-RV32-NEXT: bgez a1, .LBB61_24 +; CHECK-RV32-NEXT: j .LBB61_567 +; CHECK-RV32-NEXT: .LBB61_24: # %else90 +; CHECK-RV32-NEXT: slli a1, a3, 7 +; CHECK-RV32-NEXT: bgez a1, .LBB61_25 +; CHECK-RV32-NEXT: j .LBB61_568 +; CHECK-RV32-NEXT: .LBB61_25: # %else94 +; CHECK-RV32-NEXT: slli a1, a3, 6 +; CHECK-RV32-NEXT: bgez a1, .LBB61_26 +; CHECK-RV32-NEXT: j .LBB61_569 +; CHECK-RV32-NEXT: .LBB61_26: # %else98 +; CHECK-RV32-NEXT: slli a1, a3, 5 +; CHECK-RV32-NEXT: bgez a1, .LBB61_27 +; CHECK-RV32-NEXT: j .LBB61_570 +; CHECK-RV32-NEXT: .LBB61_27: # %else102 +; CHECK-RV32-NEXT: slli a1, a3, 4 +; CHECK-RV32-NEXT: bgez a1, .LBB61_28 +; CHECK-RV32-NEXT: j .LBB61_571 +; CHECK-RV32-NEXT: .LBB61_28: # %else106 +; CHECK-RV32-NEXT: slli a1, a3, 3 +; CHECK-RV32-NEXT: bgez a1, .LBB61_30 +; CHECK-RV32-NEXT: .LBB61_29: # %cond.load109 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 29, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 28 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_30: # %else110 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: li a1, 32 +; CHECK-RV32-NEXT: bgez a2, .LBB61_32 +; CHECK-RV32-NEXT: # %bb.31: # 
%cond.load113 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 30, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 29 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_32: # %else114 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v0, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_34 +; CHECK-RV32-NEXT: # %bb.33: # %cond.load117 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v17, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetivli zero, 31, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vi v8, v17, 30 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_34: # %else118 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_35 +; CHECK-RV32-NEXT: j .LBB61_572 +; CHECK-RV32-NEXT: .LBB61_35: # %else122 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_36 +; CHECK-RV32-NEXT: j .LBB61_573 +; CHECK-RV32-NEXT: .LBB61_36: # %else126 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_37 +; CHECK-RV32-NEXT: j .LBB61_574 +; CHECK-RV32-NEXT: .LBB61_37: # %else130 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_38 +; CHECK-RV32-NEXT: j .LBB61_575 +; CHECK-RV32-NEXT: .LBB61_38: # %else134 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_39 +; CHECK-RV32-NEXT: j .LBB61_576 +; CHECK-RV32-NEXT: .LBB61_39: # %else138 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_40 +; CHECK-RV32-NEXT: j .LBB61_577 +; CHECK-RV32-NEXT: .LBB61_40: # %else142 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_41 +; CHECK-RV32-NEXT: j .LBB61_578 +; 
CHECK-RV32-NEXT: .LBB61_41: # %else146 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_42 +; CHECK-RV32-NEXT: j .LBB61_579 +; CHECK-RV32-NEXT: .LBB61_42: # %else150 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_43 +; CHECK-RV32-NEXT: j .LBB61_580 +; CHECK-RV32-NEXT: .LBB61_43: # %else154 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_44 +; CHECK-RV32-NEXT: j .LBB61_581 +; CHECK-RV32-NEXT: .LBB61_44: # %else158 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_45 +; CHECK-RV32-NEXT: j .LBB61_582 +; CHECK-RV32-NEXT: .LBB61_45: # %else162 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_46 +; CHECK-RV32-NEXT: j .LBB61_583 +; CHECK-RV32-NEXT: .LBB61_46: # %else166 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_47 +; CHECK-RV32-NEXT: j .LBB61_584 +; CHECK-RV32-NEXT: .LBB61_47: # %else170 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_48 +; CHECK-RV32-NEXT: j .LBB61_585 +; CHECK-RV32-NEXT: .LBB61_48: # %else174 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_49 +; CHECK-RV32-NEXT: j .LBB61_586 +; CHECK-RV32-NEXT: .LBB61_49: # %else178 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_50 +; CHECK-RV32-NEXT: j .LBB61_587 +; CHECK-RV32-NEXT: .LBB61_50: # %else182 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_51 +; CHECK-RV32-NEXT: j .LBB61_588 +; CHECK-RV32-NEXT: .LBB61_51: # %else186 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_52 +; CHECK-RV32-NEXT: j .LBB61_589 +; CHECK-RV32-NEXT: .LBB61_52: # %else190 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_53 +; CHECK-RV32-NEXT: j .LBB61_590 +; CHECK-RV32-NEXT: .LBB61_53: # %else194 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_54 +; CHECK-RV32-NEXT: j .LBB61_591 +; CHECK-RV32-NEXT: .LBB61_54: # %else198 +; 
CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_55 +; CHECK-RV32-NEXT: j .LBB61_592 +; CHECK-RV32-NEXT: .LBB61_55: # %else202 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_56 +; CHECK-RV32-NEXT: j .LBB61_593 +; CHECK-RV32-NEXT: .LBB61_56: # %else206 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_57 +; CHECK-RV32-NEXT: j .LBB61_594 +; CHECK-RV32-NEXT: .LBB61_57: # %else210 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_58 +; CHECK-RV32-NEXT: j .LBB61_595 +; CHECK-RV32-NEXT: .LBB61_58: # %else214 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_59 +; CHECK-RV32-NEXT: j .LBB61_596 +; CHECK-RV32-NEXT: .LBB61_59: # %else218 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_60 +; CHECK-RV32-NEXT: j .LBB61_597 +; CHECK-RV32-NEXT: .LBB61_60: # %else222 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_61 +; CHECK-RV32-NEXT: j .LBB61_598 +; CHECK-RV32-NEXT: .LBB61_61: # %else226 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_62 +; CHECK-RV32-NEXT: j .LBB61_599 +; CHECK-RV32-NEXT: .LBB61_62: # %else230 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_63 +; CHECK-RV32-NEXT: j .LBB61_600 +; CHECK-RV32-NEXT: .LBB61_63: # %else234 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_64 +; CHECK-RV32-NEXT: j .LBB61_601 +; CHECK-RV32-NEXT: .LBB61_64: # %else238 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_66 +; CHECK-RV32-NEXT: .LBB61_65: # %cond.load241 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 62 +; CHECK-RV32-NEXT: li a4, 61 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: .LBB61_66: # %else242 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 1 +; CHECK-RV32-NEXT: bgez a3, .LBB61_68 +; CHECK-RV32-NEXT: # %bb.67: # %cond.load245 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v17, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 63 +; CHECK-RV32-NEXT: li a4, 62 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v17, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_68: # %else246 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_69 +; CHECK-RV32-NEXT: j .LBB61_602 +; CHECK-RV32-NEXT: .LBB61_69: # %else250 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_70 +; CHECK-RV32-NEXT: j .LBB61_603 +; CHECK-RV32-NEXT: .LBB61_70: # %else254 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_71 +; CHECK-RV32-NEXT: j .LBB61_604 +; CHECK-RV32-NEXT: .LBB61_71: # %else258 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_72 +; CHECK-RV32-NEXT: j .LBB61_605 +; CHECK-RV32-NEXT: .LBB61_72: # %else262 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_73 +; CHECK-RV32-NEXT: j .LBB61_606 +; CHECK-RV32-NEXT: .LBB61_73: # %else266 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_74 +; CHECK-RV32-NEXT: j .LBB61_607 +; CHECK-RV32-NEXT: .LBB61_74: # %else270 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_75 +; CHECK-RV32-NEXT: j .LBB61_608 +; CHECK-RV32-NEXT: .LBB61_75: # %else274 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_76 +; CHECK-RV32-NEXT: j .LBB61_609 +; CHECK-RV32-NEXT: .LBB61_76: # %else278 +; 
CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_77 +; CHECK-RV32-NEXT: j .LBB61_610 +; CHECK-RV32-NEXT: .LBB61_77: # %else282 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_78 +; CHECK-RV32-NEXT: j .LBB61_611 +; CHECK-RV32-NEXT: .LBB61_78: # %else286 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_79 +; CHECK-RV32-NEXT: j .LBB61_612 +; CHECK-RV32-NEXT: .LBB61_79: # %else290 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_80 +; CHECK-RV32-NEXT: j .LBB61_613 +; CHECK-RV32-NEXT: .LBB61_80: # %else294 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_81 +; CHECK-RV32-NEXT: j .LBB61_614 +; CHECK-RV32-NEXT: .LBB61_81: # %else298 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_82 +; CHECK-RV32-NEXT: j .LBB61_615 +; CHECK-RV32-NEXT: .LBB61_82: # %else302 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_83 +; CHECK-RV32-NEXT: j .LBB61_616 +; CHECK-RV32-NEXT: .LBB61_83: # %else306 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_84 +; CHECK-RV32-NEXT: j .LBB61_617 +; CHECK-RV32-NEXT: .LBB61_84: # %else310 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_85 +; CHECK-RV32-NEXT: j .LBB61_618 +; CHECK-RV32-NEXT: .LBB61_85: # %else314 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_86 +; CHECK-RV32-NEXT: j .LBB61_619 +; CHECK-RV32-NEXT: .LBB61_86: # %else318 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_87 +; CHECK-RV32-NEXT: j .LBB61_620 +; CHECK-RV32-NEXT: .LBB61_87: # %else322 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_88 +; CHECK-RV32-NEXT: j .LBB61_621 +; CHECK-RV32-NEXT: .LBB61_88: # %else326 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_89 +; CHECK-RV32-NEXT: j .LBB61_622 +; CHECK-RV32-NEXT: .LBB61_89: # %else330 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; 
CHECK-RV32-NEXT: bgez a2, .LBB61_90 +; CHECK-RV32-NEXT: j .LBB61_623 +; CHECK-RV32-NEXT: .LBB61_90: # %else334 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_91 +; CHECK-RV32-NEXT: j .LBB61_624 +; CHECK-RV32-NEXT: .LBB61_91: # %else338 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_92 +; CHECK-RV32-NEXT: j .LBB61_625 +; CHECK-RV32-NEXT: .LBB61_92: # %else342 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_93 +; CHECK-RV32-NEXT: j .LBB61_626 +; CHECK-RV32-NEXT: .LBB61_93: # %else346 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_94 +; CHECK-RV32-NEXT: j .LBB61_627 +; CHECK-RV32-NEXT: .LBB61_94: # %else350 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_95 +; CHECK-RV32-NEXT: j .LBB61_628 +; CHECK-RV32-NEXT: .LBB61_95: # %else354 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_96 +; CHECK-RV32-NEXT: j .LBB61_629 +; CHECK-RV32-NEXT: .LBB61_96: # %else358 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_97 +; CHECK-RV32-NEXT: j .LBB61_630 +; CHECK-RV32-NEXT: .LBB61_97: # %else362 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_98 +; CHECK-RV32-NEXT: j .LBB61_631 +; CHECK-RV32-NEXT: .LBB61_98: # %else366 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_100 +; CHECK-RV32-NEXT: .LBB61_99: # %cond.load369 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 94 +; CHECK-RV32-NEXT: li a4, 93 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_100: # %else370 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, 
e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_102 +; CHECK-RV32-NEXT: # %bb.101: # %cond.load373 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 95 +; CHECK-RV32-NEXT: li a4, 94 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_102: # %else374 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_103 +; CHECK-RV32-NEXT: j .LBB61_632 +; CHECK-RV32-NEXT: .LBB61_103: # %else378 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_104 +; CHECK-RV32-NEXT: j .LBB61_633 +; CHECK-RV32-NEXT: .LBB61_104: # %else382 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_105 +; CHECK-RV32-NEXT: j .LBB61_634 +; CHECK-RV32-NEXT: .LBB61_105: # %else386 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_106 +; CHECK-RV32-NEXT: j .LBB61_635 +; CHECK-RV32-NEXT: .LBB61_106: # %else390 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_107 +; CHECK-RV32-NEXT: j .LBB61_636 +; CHECK-RV32-NEXT: .LBB61_107: # %else394 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_108 +; CHECK-RV32-NEXT: j .LBB61_637 +; CHECK-RV32-NEXT: .LBB61_108: # %else398 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_109 +; CHECK-RV32-NEXT: j .LBB61_638 +; CHECK-RV32-NEXT: .LBB61_109: # %else402 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_110 +; CHECK-RV32-NEXT: j .LBB61_639 +; CHECK-RV32-NEXT: .LBB61_110: # %else406 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_111 +; CHECK-RV32-NEXT: j .LBB61_640 +; CHECK-RV32-NEXT: .LBB61_111: # %else410 +; CHECK-RV32-NEXT: 
andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_112 +; CHECK-RV32-NEXT: j .LBB61_641 +; CHECK-RV32-NEXT: .LBB61_112: # %else414 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_113 +; CHECK-RV32-NEXT: j .LBB61_642 +; CHECK-RV32-NEXT: .LBB61_113: # %else418 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_114 +; CHECK-RV32-NEXT: j .LBB61_643 +; CHECK-RV32-NEXT: .LBB61_114: # %else422 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_115 +; CHECK-RV32-NEXT: j .LBB61_644 +; CHECK-RV32-NEXT: .LBB61_115: # %else426 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_116 +; CHECK-RV32-NEXT: j .LBB61_645 +; CHECK-RV32-NEXT: .LBB61_116: # %else430 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_117 +; CHECK-RV32-NEXT: j .LBB61_646 +; CHECK-RV32-NEXT: .LBB61_117: # %else434 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_118 +; CHECK-RV32-NEXT: j .LBB61_647 +; CHECK-RV32-NEXT: .LBB61_118: # %else438 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_119 +; CHECK-RV32-NEXT: j .LBB61_648 +; CHECK-RV32-NEXT: .LBB61_119: # %else442 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_120 +; CHECK-RV32-NEXT: j .LBB61_649 +; CHECK-RV32-NEXT: .LBB61_120: # %else446 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_121 +; CHECK-RV32-NEXT: j .LBB61_650 +; CHECK-RV32-NEXT: .LBB61_121: # %else450 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_122 +; CHECK-RV32-NEXT: j .LBB61_651 +; CHECK-RV32-NEXT: .LBB61_122: # %else454 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_123 +; CHECK-RV32-NEXT: j .LBB61_652 +; CHECK-RV32-NEXT: .LBB61_123: # %else458 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_124 +; CHECK-RV32-NEXT: j .LBB61_653 +; CHECK-RV32-NEXT: .LBB61_124: # %else462 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; 
CHECK-RV32-NEXT: bgez a3, .LBB61_125 +; CHECK-RV32-NEXT: j .LBB61_654 +; CHECK-RV32-NEXT: .LBB61_125: # %else466 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_126 +; CHECK-RV32-NEXT: j .LBB61_655 +; CHECK-RV32-NEXT: .LBB61_126: # %else470 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_127 +; CHECK-RV32-NEXT: j .LBB61_656 +; CHECK-RV32-NEXT: .LBB61_127: # %else474 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_128 +; CHECK-RV32-NEXT: j .LBB61_657 +; CHECK-RV32-NEXT: .LBB61_128: # %else478 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_129 +; CHECK-RV32-NEXT: j .LBB61_658 +; CHECK-RV32-NEXT: .LBB61_129: # %else482 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_130 +; CHECK-RV32-NEXT: j .LBB61_659 +; CHECK-RV32-NEXT: .LBB61_130: # %else486 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_131 +; CHECK-RV32-NEXT: j .LBB61_660 +; CHECK-RV32-NEXT: .LBB61_131: # %else490 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_132 +; CHECK-RV32-NEXT: j .LBB61_661 +; CHECK-RV32-NEXT: .LBB61_132: # %else494 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_134 +; CHECK-RV32-NEXT: .LBB61_133: # %cond.load497 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 126 +; CHECK-RV32-NEXT: li a4, 125 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: .LBB61_134: # %else498 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_136 +; CHECK-RV32-NEXT: # %bb.135: # 
%cond.load501 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v18, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 127 +; CHECK-RV32-NEXT: li a4, 126 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_136: # %else502 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_137 +; CHECK-RV32-NEXT: j .LBB61_662 +; CHECK-RV32-NEXT: .LBB61_137: # %else506 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_138 +; CHECK-RV32-NEXT: j .LBB61_663 +; CHECK-RV32-NEXT: .LBB61_138: # %else510 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_139 +; CHECK-RV32-NEXT: j .LBB61_664 +; CHECK-RV32-NEXT: .LBB61_139: # %else514 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_140 +; CHECK-RV32-NEXT: j .LBB61_665 +; CHECK-RV32-NEXT: .LBB61_140: # %else518 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_141 +; CHECK-RV32-NEXT: j .LBB61_666 +; CHECK-RV32-NEXT: .LBB61_141: # %else522 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_142 +; CHECK-RV32-NEXT: j .LBB61_667 +; CHECK-RV32-NEXT: .LBB61_142: # %else526 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_143 +; CHECK-RV32-NEXT: j .LBB61_668 +; CHECK-RV32-NEXT: .LBB61_143: # %else530 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_144 +; CHECK-RV32-NEXT: j .LBB61_669 +; CHECK-RV32-NEXT: .LBB61_144: # %else534 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_145 +; CHECK-RV32-NEXT: j .LBB61_670 +; CHECK-RV32-NEXT: .LBB61_145: # %else538 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_146 +; CHECK-RV32-NEXT: j .LBB61_671 +; CHECK-RV32-NEXT: .LBB61_146: # 
%else542 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_147 +; CHECK-RV32-NEXT: j .LBB61_672 +; CHECK-RV32-NEXT: .LBB61_147: # %else546 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_148 +; CHECK-RV32-NEXT: j .LBB61_673 +; CHECK-RV32-NEXT: .LBB61_148: # %else550 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_149 +; CHECK-RV32-NEXT: j .LBB61_674 +; CHECK-RV32-NEXT: .LBB61_149: # %else554 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_150 +; CHECK-RV32-NEXT: j .LBB61_675 +; CHECK-RV32-NEXT: .LBB61_150: # %else558 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_151 +; CHECK-RV32-NEXT: j .LBB61_676 +; CHECK-RV32-NEXT: .LBB61_151: # %else562 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_152 +; CHECK-RV32-NEXT: j .LBB61_677 +; CHECK-RV32-NEXT: .LBB61_152: # %else566 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_153 +; CHECK-RV32-NEXT: j .LBB61_678 +; CHECK-RV32-NEXT: .LBB61_153: # %else570 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_154 +; CHECK-RV32-NEXT: j .LBB61_679 +; CHECK-RV32-NEXT: .LBB61_154: # %else574 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_155 +; CHECK-RV32-NEXT: j .LBB61_680 +; CHECK-RV32-NEXT: .LBB61_155: # %else578 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_156 +; CHECK-RV32-NEXT: j .LBB61_681 +; CHECK-RV32-NEXT: .LBB61_156: # %else582 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_157 +; CHECK-RV32-NEXT: j .LBB61_682 +; CHECK-RV32-NEXT: .LBB61_157: # %else586 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_158 +; CHECK-RV32-NEXT: j .LBB61_683 +; CHECK-RV32-NEXT: .LBB61_158: # %else590 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_159 +; CHECK-RV32-NEXT: j .LBB61_684 +; CHECK-RV32-NEXT: .LBB61_159: # %else594 +; 
CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_160 +; CHECK-RV32-NEXT: j .LBB61_685 +; CHECK-RV32-NEXT: .LBB61_160: # %else598 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_161 +; CHECK-RV32-NEXT: j .LBB61_686 +; CHECK-RV32-NEXT: .LBB61_161: # %else602 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_162 +; CHECK-RV32-NEXT: j .LBB61_687 +; CHECK-RV32-NEXT: .LBB61_162: # %else606 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_163 +; CHECK-RV32-NEXT: j .LBB61_688 +; CHECK-RV32-NEXT: .LBB61_163: # %else610 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_164 +; CHECK-RV32-NEXT: j .LBB61_689 +; CHECK-RV32-NEXT: .LBB61_164: # %else614 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_165 +; CHECK-RV32-NEXT: j .LBB61_690 +; CHECK-RV32-NEXT: .LBB61_165: # %else618 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_166 +; CHECK-RV32-NEXT: j .LBB61_691 +; CHECK-RV32-NEXT: .LBB61_166: # %else622 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_168 +; CHECK-RV32-NEXT: .LBB61_167: # %cond.load625 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 158 +; CHECK-RV32-NEXT: li a4, 157 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_168: # %else626 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_170 +; CHECK-RV32-NEXT: # %bb.169: # %cond.load629 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 
+; CHECK-RV32-NEXT: li a2, 159 +; CHECK-RV32-NEXT: li a4, 158 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_170: # %else630 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_171 +; CHECK-RV32-NEXT: j .LBB61_692 +; CHECK-RV32-NEXT: .LBB61_171: # %else634 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_172 +; CHECK-RV32-NEXT: j .LBB61_693 +; CHECK-RV32-NEXT: .LBB61_172: # %else638 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_173 +; CHECK-RV32-NEXT: j .LBB61_694 +; CHECK-RV32-NEXT: .LBB61_173: # %else642 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_174 +; CHECK-RV32-NEXT: j .LBB61_695 +; CHECK-RV32-NEXT: .LBB61_174: # %else646 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_175 +; CHECK-RV32-NEXT: j .LBB61_696 +; CHECK-RV32-NEXT: .LBB61_175: # %else650 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_176 +; CHECK-RV32-NEXT: j .LBB61_697 +; CHECK-RV32-NEXT: .LBB61_176: # %else654 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_177 +; CHECK-RV32-NEXT: j .LBB61_698 +; CHECK-RV32-NEXT: .LBB61_177: # %else658 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_178 +; CHECK-RV32-NEXT: j .LBB61_699 +; CHECK-RV32-NEXT: .LBB61_178: # %else662 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_179 +; CHECK-RV32-NEXT: j .LBB61_700 +; CHECK-RV32-NEXT: .LBB61_179: # %else666 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_180 +; CHECK-RV32-NEXT: j .LBB61_701 +; CHECK-RV32-NEXT: .LBB61_180: # %else670 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_181 +; CHECK-RV32-NEXT: j .LBB61_702 +; 
CHECK-RV32-NEXT: .LBB61_181: # %else674 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_182 +; CHECK-RV32-NEXT: j .LBB61_703 +; CHECK-RV32-NEXT: .LBB61_182: # %else678 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_183 +; CHECK-RV32-NEXT: j .LBB61_704 +; CHECK-RV32-NEXT: .LBB61_183: # %else682 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_184 +; CHECK-RV32-NEXT: j .LBB61_705 +; CHECK-RV32-NEXT: .LBB61_184: # %else686 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_185 +; CHECK-RV32-NEXT: j .LBB61_706 +; CHECK-RV32-NEXT: .LBB61_185: # %else690 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_186 +; CHECK-RV32-NEXT: j .LBB61_707 +; CHECK-RV32-NEXT: .LBB61_186: # %else694 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_187 +; CHECK-RV32-NEXT: j .LBB61_708 +; CHECK-RV32-NEXT: .LBB61_187: # %else698 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_188 +; CHECK-RV32-NEXT: j .LBB61_709 +; CHECK-RV32-NEXT: .LBB61_188: # %else702 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_189 +; CHECK-RV32-NEXT: j .LBB61_710 +; CHECK-RV32-NEXT: .LBB61_189: # %else706 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_190 +; CHECK-RV32-NEXT: j .LBB61_711 +; CHECK-RV32-NEXT: .LBB61_190: # %else710 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_191 +; CHECK-RV32-NEXT: j .LBB61_712 +; CHECK-RV32-NEXT: .LBB61_191: # %else714 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_192 +; CHECK-RV32-NEXT: j .LBB61_713 +; CHECK-RV32-NEXT: .LBB61_192: # %else718 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_193 +; CHECK-RV32-NEXT: j .LBB61_714 +; CHECK-RV32-NEXT: .LBB61_193: # %else722 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_194 +; CHECK-RV32-NEXT: j .LBB61_715 +; CHECK-RV32-NEXT: 
.LBB61_194: # %else726 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_195 +; CHECK-RV32-NEXT: j .LBB61_716 +; CHECK-RV32-NEXT: .LBB61_195: # %else730 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_196 +; CHECK-RV32-NEXT: j .LBB61_717 +; CHECK-RV32-NEXT: .LBB61_196: # %else734 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_197 +; CHECK-RV32-NEXT: j .LBB61_718 +; CHECK-RV32-NEXT: .LBB61_197: # %else738 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_198 +; CHECK-RV32-NEXT: j .LBB61_719 +; CHECK-RV32-NEXT: .LBB61_198: # %else742 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_199 +; CHECK-RV32-NEXT: j .LBB61_720 +; CHECK-RV32-NEXT: .LBB61_199: # %else746 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_200 +; CHECK-RV32-NEXT: j .LBB61_721 +; CHECK-RV32-NEXT: .LBB61_200: # %else750 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_202 +; CHECK-RV32-NEXT: .LBB61_201: # %cond.load753 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 190 +; CHECK-RV32-NEXT: li a4, 189 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_202: # %else754 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_204 +; CHECK-RV32-NEXT: # %bb.203: # %cond.load757 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 191 +; CHECK-RV32-NEXT: li a4, 190 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_204: # %else758 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_205 +; CHECK-RV32-NEXT: j .LBB61_722 +; CHECK-RV32-NEXT: .LBB61_205: # %else762 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_206 +; CHECK-RV32-NEXT: j .LBB61_723 +; CHECK-RV32-NEXT: .LBB61_206: # %else766 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_207 +; CHECK-RV32-NEXT: j .LBB61_724 +; CHECK-RV32-NEXT: .LBB61_207: # %else770 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_208 +; CHECK-RV32-NEXT: j .LBB61_725 +; CHECK-RV32-NEXT: .LBB61_208: # %else774 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_209 +; CHECK-RV32-NEXT: j .LBB61_726 +; CHECK-RV32-NEXT: .LBB61_209: # %else778 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_210 +; CHECK-RV32-NEXT: j .LBB61_727 +; CHECK-RV32-NEXT: .LBB61_210: # %else782 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_211 +; CHECK-RV32-NEXT: j .LBB61_728 +; CHECK-RV32-NEXT: .LBB61_211: # %else786 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_212 +; CHECK-RV32-NEXT: j .LBB61_729 +; CHECK-RV32-NEXT: .LBB61_212: # %else790 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_213 +; CHECK-RV32-NEXT: j .LBB61_730 +; CHECK-RV32-NEXT: .LBB61_213: # %else794 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_214 +; CHECK-RV32-NEXT: j .LBB61_731 +; CHECK-RV32-NEXT: .LBB61_214: # %else798 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_215 +; CHECK-RV32-NEXT: j .LBB61_732 +; CHECK-RV32-NEXT: .LBB61_215: # %else802 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_216 +; 
CHECK-RV32-NEXT: j .LBB61_733 +; CHECK-RV32-NEXT: .LBB61_216: # %else806 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_217 +; CHECK-RV32-NEXT: j .LBB61_734 +; CHECK-RV32-NEXT: .LBB61_217: # %else810 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_218 +; CHECK-RV32-NEXT: j .LBB61_735 +; CHECK-RV32-NEXT: .LBB61_218: # %else814 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_219 +; CHECK-RV32-NEXT: j .LBB61_736 +; CHECK-RV32-NEXT: .LBB61_219: # %else818 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_220 +; CHECK-RV32-NEXT: j .LBB61_737 +; CHECK-RV32-NEXT: .LBB61_220: # %else822 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_221 +; CHECK-RV32-NEXT: j .LBB61_738 +; CHECK-RV32-NEXT: .LBB61_221: # %else826 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_222 +; CHECK-RV32-NEXT: j .LBB61_739 +; CHECK-RV32-NEXT: .LBB61_222: # %else830 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_223 +; CHECK-RV32-NEXT: j .LBB61_740 +; CHECK-RV32-NEXT: .LBB61_223: # %else834 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_224 +; CHECK-RV32-NEXT: j .LBB61_741 +; CHECK-RV32-NEXT: .LBB61_224: # %else838 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_225 +; CHECK-RV32-NEXT: j .LBB61_742 +; CHECK-RV32-NEXT: .LBB61_225: # %else842 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_226 +; CHECK-RV32-NEXT: j .LBB61_743 +; CHECK-RV32-NEXT: .LBB61_226: # %else846 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_227 +; CHECK-RV32-NEXT: j .LBB61_744 +; CHECK-RV32-NEXT: .LBB61_227: # %else850 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_228 +; CHECK-RV32-NEXT: j .LBB61_745 +; CHECK-RV32-NEXT: .LBB61_228: # %else854 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_229 +; CHECK-RV32-NEXT: j 
.LBB61_746 +; CHECK-RV32-NEXT: .LBB61_229: # %else858 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_230 +; CHECK-RV32-NEXT: j .LBB61_747 +; CHECK-RV32-NEXT: .LBB61_230: # %else862 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_231 +; CHECK-RV32-NEXT: j .LBB61_748 +; CHECK-RV32-NEXT: .LBB61_231: # %else866 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_232 +; CHECK-RV32-NEXT: j .LBB61_749 +; CHECK-RV32-NEXT: .LBB61_232: # %else870 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_233 +; CHECK-RV32-NEXT: j .LBB61_750 +; CHECK-RV32-NEXT: .LBB61_233: # %else874 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_234 +; CHECK-RV32-NEXT: j .LBB61_751 +; CHECK-RV32-NEXT: .LBB61_234: # %else878 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_236 +; CHECK-RV32-NEXT: .LBB61_235: # %cond.load881 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 222 +; CHECK-RV32-NEXT: li a4, 221 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_236: # %else882 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_238 +; CHECK-RV32-NEXT: # %bb.237: # %cond.load885 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 223 +; CHECK-RV32-NEXT: li a4, 222 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_238: # %else886 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_239 +; CHECK-RV32-NEXT: j .LBB61_752 +; CHECK-RV32-NEXT: .LBB61_239: # %else890 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_240 +; CHECK-RV32-NEXT: j .LBB61_753 +; CHECK-RV32-NEXT: .LBB61_240: # %else894 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_241 +; CHECK-RV32-NEXT: j .LBB61_754 +; CHECK-RV32-NEXT: .LBB61_241: # %else898 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_242 +; CHECK-RV32-NEXT: j .LBB61_755 +; CHECK-RV32-NEXT: .LBB61_242: # %else902 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_243 +; CHECK-RV32-NEXT: j .LBB61_756 +; CHECK-RV32-NEXT: .LBB61_243: # %else906 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_244 +; CHECK-RV32-NEXT: j .LBB61_757 +; CHECK-RV32-NEXT: .LBB61_244: # %else910 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_245 +; CHECK-RV32-NEXT: j .LBB61_758 +; CHECK-RV32-NEXT: .LBB61_245: # %else914 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_246 +; CHECK-RV32-NEXT: j .LBB61_759 +; CHECK-RV32-NEXT: .LBB61_246: # %else918 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_247 +; CHECK-RV32-NEXT: j .LBB61_760 +; CHECK-RV32-NEXT: .LBB61_247: # %else922 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_248 +; CHECK-RV32-NEXT: j .LBB61_761 +; CHECK-RV32-NEXT: .LBB61_248: # %else926 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_249 +; CHECK-RV32-NEXT: j .LBB61_762 +; CHECK-RV32-NEXT: .LBB61_249: # %else930 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_250 +; CHECK-RV32-NEXT: j .LBB61_763 +; CHECK-RV32-NEXT: .LBB61_250: # %else934 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; 
CHECK-RV32-NEXT: bgez a3, .LBB61_251 +; CHECK-RV32-NEXT: j .LBB61_764 +; CHECK-RV32-NEXT: .LBB61_251: # %else938 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_252 +; CHECK-RV32-NEXT: j .LBB61_765 +; CHECK-RV32-NEXT: .LBB61_252: # %else942 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_253 +; CHECK-RV32-NEXT: j .LBB61_766 +; CHECK-RV32-NEXT: .LBB61_253: # %else946 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_254 +; CHECK-RV32-NEXT: j .LBB61_767 +; CHECK-RV32-NEXT: .LBB61_254: # %else950 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_255 +; CHECK-RV32-NEXT: j .LBB61_768 +; CHECK-RV32-NEXT: .LBB61_255: # %else954 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_256 +; CHECK-RV32-NEXT: j .LBB61_769 +; CHECK-RV32-NEXT: .LBB61_256: # %else958 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_257 +; CHECK-RV32-NEXT: j .LBB61_770 +; CHECK-RV32-NEXT: .LBB61_257: # %else962 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_258 +; CHECK-RV32-NEXT: j .LBB61_771 +; CHECK-RV32-NEXT: .LBB61_258: # %else966 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_259 +; CHECK-RV32-NEXT: j .LBB61_772 +; CHECK-RV32-NEXT: .LBB61_259: # %else970 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_260 +; CHECK-RV32-NEXT: j .LBB61_773 +; CHECK-RV32-NEXT: .LBB61_260: # %else974 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_261 +; CHECK-RV32-NEXT: j .LBB61_774 +; CHECK-RV32-NEXT: .LBB61_261: # %else978 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_262 +; CHECK-RV32-NEXT: j .LBB61_775 +; CHECK-RV32-NEXT: .LBB61_262: # %else982 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_263 +; CHECK-RV32-NEXT: j .LBB61_776 +; CHECK-RV32-NEXT: .LBB61_263: # %else986 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, 
.LBB61_264 +; CHECK-RV32-NEXT: j .LBB61_777 +; CHECK-RV32-NEXT: .LBB61_264: # %else990 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_265 +; CHECK-RV32-NEXT: j .LBB61_778 +; CHECK-RV32-NEXT: .LBB61_265: # %else994 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_266 +; CHECK-RV32-NEXT: j .LBB61_779 +; CHECK-RV32-NEXT: .LBB61_266: # %else998 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_267 +; CHECK-RV32-NEXT: j .LBB61_780 +; CHECK-RV32-NEXT: .LBB61_267: # %else1002 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_268 +; CHECK-RV32-NEXT: j .LBB61_781 +; CHECK-RV32-NEXT: .LBB61_268: # %else1006 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_270 +; CHECK-RV32-NEXT: .LBB61_269: # %cond.load1009 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 254 +; CHECK-RV32-NEXT: li a4, 253 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_270: # %else1010 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_272 +; CHECK-RV32-NEXT: # %bb.271: # %cond.load1013 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 255 +; CHECK-RV32-NEXT: li a4, 254 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: .LBB61_272: # %else1014 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, 
m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_273 +; CHECK-RV32-NEXT: j .LBB61_782 +; CHECK-RV32-NEXT: .LBB61_273: # %else1018 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_274 +; CHECK-RV32-NEXT: j .LBB61_783 +; CHECK-RV32-NEXT: .LBB61_274: # %else1022 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_275 +; CHECK-RV32-NEXT: j .LBB61_784 +; CHECK-RV32-NEXT: .LBB61_275: # %else1026 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_276 +; CHECK-RV32-NEXT: j .LBB61_785 +; CHECK-RV32-NEXT: .LBB61_276: # %else1030 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_277 +; CHECK-RV32-NEXT: j .LBB61_786 +; CHECK-RV32-NEXT: .LBB61_277: # %else1034 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_278 +; CHECK-RV32-NEXT: j .LBB61_787 +; CHECK-RV32-NEXT: .LBB61_278: # %else1038 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_279 +; CHECK-RV32-NEXT: j .LBB61_788 +; CHECK-RV32-NEXT: .LBB61_279: # %else1042 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_280 +; CHECK-RV32-NEXT: j .LBB61_789 +; CHECK-RV32-NEXT: .LBB61_280: # %else1046 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_281 +; CHECK-RV32-NEXT: j .LBB61_790 +; CHECK-RV32-NEXT: .LBB61_281: # %else1050 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_282 +; CHECK-RV32-NEXT: j .LBB61_791 +; CHECK-RV32-NEXT: .LBB61_282: # %else1054 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_283 +; CHECK-RV32-NEXT: j .LBB61_792 +; CHECK-RV32-NEXT: .LBB61_283: # %else1058 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_284 +; CHECK-RV32-NEXT: j .LBB61_793 +; CHECK-RV32-NEXT: .LBB61_284: # %else1062 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_285 +; CHECK-RV32-NEXT: j .LBB61_794 +; CHECK-RV32-NEXT: .LBB61_285: # 
%else1066 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_286 +; CHECK-RV32-NEXT: j .LBB61_795 +; CHECK-RV32-NEXT: .LBB61_286: # %else1070 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_287 +; CHECK-RV32-NEXT: j .LBB61_796 +; CHECK-RV32-NEXT: .LBB61_287: # %else1074 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_288 +; CHECK-RV32-NEXT: j .LBB61_797 +; CHECK-RV32-NEXT: .LBB61_288: # %else1078 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_289 +; CHECK-RV32-NEXT: j .LBB61_798 +; CHECK-RV32-NEXT: .LBB61_289: # %else1082 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_290 +; CHECK-RV32-NEXT: j .LBB61_799 +; CHECK-RV32-NEXT: .LBB61_290: # %else1086 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_291 +; CHECK-RV32-NEXT: j .LBB61_800 +; CHECK-RV32-NEXT: .LBB61_291: # %else1090 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_292 +; CHECK-RV32-NEXT: j .LBB61_801 +; CHECK-RV32-NEXT: .LBB61_292: # %else1094 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_293 +; CHECK-RV32-NEXT: j .LBB61_802 +; CHECK-RV32-NEXT: .LBB61_293: # %else1098 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_294 +; CHECK-RV32-NEXT: j .LBB61_803 +; CHECK-RV32-NEXT: .LBB61_294: # %else1102 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_295 +; CHECK-RV32-NEXT: j .LBB61_804 +; CHECK-RV32-NEXT: .LBB61_295: # %else1106 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_296 +; CHECK-RV32-NEXT: j .LBB61_805 +; CHECK-RV32-NEXT: .LBB61_296: # %else1110 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_297 +; CHECK-RV32-NEXT: j .LBB61_806 +; CHECK-RV32-NEXT: .LBB61_297: # %else1114 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_298 +; CHECK-RV32-NEXT: j .LBB61_807 +; CHECK-RV32-NEXT: .LBB61_298: # %else1118 +; 
CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_299 +; CHECK-RV32-NEXT: j .LBB61_808 +; CHECK-RV32-NEXT: .LBB61_299: # %else1122 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_300 +; CHECK-RV32-NEXT: j .LBB61_809 +; CHECK-RV32-NEXT: .LBB61_300: # %else1126 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_301 +; CHECK-RV32-NEXT: j .LBB61_810 +; CHECK-RV32-NEXT: .LBB61_301: # %else1130 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_302 +; CHECK-RV32-NEXT: j .LBB61_811 +; CHECK-RV32-NEXT: .LBB61_302: # %else1134 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_304 +; CHECK-RV32-NEXT: .LBB61_303: # %cond.load1137 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 286 +; CHECK-RV32-NEXT: li a4, 285 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_304: # %else1138 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_306 +; CHECK-RV32-NEXT: # %bb.305: # %cond.load1141 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 287 +; CHECK-RV32-NEXT: li a4, 286 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_306: # %else1142 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_307 +; CHECK-RV32-NEXT: j .LBB61_812 +; CHECK-RV32-NEXT: .LBB61_307: # %else1146 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_308 +; CHECK-RV32-NEXT: j .LBB61_813 +; CHECK-RV32-NEXT: .LBB61_308: # 
%else1150 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_309 +; CHECK-RV32-NEXT: j .LBB61_814 +; CHECK-RV32-NEXT: .LBB61_309: # %else1154 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_310 +; CHECK-RV32-NEXT: j .LBB61_815 +; CHECK-RV32-NEXT: .LBB61_310: # %else1158 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_311 +; CHECK-RV32-NEXT: j .LBB61_816 +; CHECK-RV32-NEXT: .LBB61_311: # %else1162 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_312 +; CHECK-RV32-NEXT: j .LBB61_817 +; CHECK-RV32-NEXT: .LBB61_312: # %else1166 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_313 +; CHECK-RV32-NEXT: j .LBB61_818 +; CHECK-RV32-NEXT: .LBB61_313: # %else1170 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_314 +; CHECK-RV32-NEXT: j .LBB61_819 +; CHECK-RV32-NEXT: .LBB61_314: # %else1174 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_315 +; CHECK-RV32-NEXT: j .LBB61_820 +; CHECK-RV32-NEXT: .LBB61_315: # %else1178 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_316 +; CHECK-RV32-NEXT: j .LBB61_821 +; CHECK-RV32-NEXT: .LBB61_316: # %else1182 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_317 +; CHECK-RV32-NEXT: j .LBB61_822 +; CHECK-RV32-NEXT: .LBB61_317: # %else1186 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_318 +; CHECK-RV32-NEXT: j .LBB61_823 +; CHECK-RV32-NEXT: .LBB61_318: # %else1190 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_319 +; CHECK-RV32-NEXT: j .LBB61_824 +; CHECK-RV32-NEXT: .LBB61_319: # %else1194 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_320 +; CHECK-RV32-NEXT: j .LBB61_825 +; CHECK-RV32-NEXT: .LBB61_320: # %else1198 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_321 +; CHECK-RV32-NEXT: j .LBB61_826 +; CHECK-RV32-NEXT: .LBB61_321: # 
%else1202 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_322 +; CHECK-RV32-NEXT: j .LBB61_827 +; CHECK-RV32-NEXT: .LBB61_322: # %else1206 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_323 +; CHECK-RV32-NEXT: j .LBB61_828 +; CHECK-RV32-NEXT: .LBB61_323: # %else1210 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_324 +; CHECK-RV32-NEXT: j .LBB61_829 +; CHECK-RV32-NEXT: .LBB61_324: # %else1214 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_325 +; CHECK-RV32-NEXT: j .LBB61_830 +; CHECK-RV32-NEXT: .LBB61_325: # %else1218 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_326 +; CHECK-RV32-NEXT: j .LBB61_831 +; CHECK-RV32-NEXT: .LBB61_326: # %else1222 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_327 +; CHECK-RV32-NEXT: j .LBB61_832 +; CHECK-RV32-NEXT: .LBB61_327: # %else1226 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_328 +; CHECK-RV32-NEXT: j .LBB61_833 +; CHECK-RV32-NEXT: .LBB61_328: # %else1230 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_329 +; CHECK-RV32-NEXT: j .LBB61_834 +; CHECK-RV32-NEXT: .LBB61_329: # %else1234 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_330 +; CHECK-RV32-NEXT: j .LBB61_835 +; CHECK-RV32-NEXT: .LBB61_330: # %else1238 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_331 +; CHECK-RV32-NEXT: j .LBB61_836 +; CHECK-RV32-NEXT: .LBB61_331: # %else1242 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_332 +; CHECK-RV32-NEXT: j .LBB61_837 +; CHECK-RV32-NEXT: .LBB61_332: # %else1246 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_333 +; CHECK-RV32-NEXT: j .LBB61_838 +; CHECK-RV32-NEXT: .LBB61_333: # %else1250 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_334 +; CHECK-RV32-NEXT: j .LBB61_839 +; CHECK-RV32-NEXT: .LBB61_334: # %else1254 +; 
CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_335 +; CHECK-RV32-NEXT: j .LBB61_840 +; CHECK-RV32-NEXT: .LBB61_335: # %else1258 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_336 +; CHECK-RV32-NEXT: j .LBB61_841 +; CHECK-RV32-NEXT: .LBB61_336: # %else1262 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_338 +; CHECK-RV32-NEXT: .LBB61_337: # %cond.load1265 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 318 +; CHECK-RV32-NEXT: li a4, 317 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_338: # %else1266 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_340 +; CHECK-RV32-NEXT: # %bb.339: # %cond.load1269 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: li a3, 319 +; CHECK-RV32-NEXT: li a4, 318 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_340: # %else1270 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_341 +; CHECK-RV32-NEXT: j .LBB61_842 +; CHECK-RV32-NEXT: .LBB61_341: # %else1274 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_342 +; CHECK-RV32-NEXT: j .LBB61_843 +; CHECK-RV32-NEXT: .LBB61_342: # %else1278 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_343 +; CHECK-RV32-NEXT: j .LBB61_844 +; CHECK-RV32-NEXT: .LBB61_343: # %else1282 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_344 +; CHECK-RV32-NEXT: j .LBB61_845 +; CHECK-RV32-NEXT: .LBB61_344: # 
%else1286 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_345 +; CHECK-RV32-NEXT: j .LBB61_846 +; CHECK-RV32-NEXT: .LBB61_345: # %else1290 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_346 +; CHECK-RV32-NEXT: j .LBB61_847 +; CHECK-RV32-NEXT: .LBB61_346: # %else1294 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_347 +; CHECK-RV32-NEXT: j .LBB61_848 +; CHECK-RV32-NEXT: .LBB61_347: # %else1298 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_348 +; CHECK-RV32-NEXT: j .LBB61_849 +; CHECK-RV32-NEXT: .LBB61_348: # %else1302 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_349 +; CHECK-RV32-NEXT: j .LBB61_850 +; CHECK-RV32-NEXT: .LBB61_349: # %else1306 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_350 +; CHECK-RV32-NEXT: j .LBB61_851 +; CHECK-RV32-NEXT: .LBB61_350: # %else1310 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_351 +; CHECK-RV32-NEXT: j .LBB61_852 +; CHECK-RV32-NEXT: .LBB61_351: # %else1314 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_352 +; CHECK-RV32-NEXT: j .LBB61_853 +; CHECK-RV32-NEXT: .LBB61_352: # %else1318 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_353 +; CHECK-RV32-NEXT: j .LBB61_854 +; CHECK-RV32-NEXT: .LBB61_353: # %else1322 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_354 +; CHECK-RV32-NEXT: j .LBB61_855 +; CHECK-RV32-NEXT: .LBB61_354: # %else1326 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_355 +; CHECK-RV32-NEXT: j .LBB61_856 +; CHECK-RV32-NEXT: .LBB61_355: # %else1330 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_356 +; CHECK-RV32-NEXT: j .LBB61_857 +; CHECK-RV32-NEXT: .LBB61_356: # %else1334 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_357 +; CHECK-RV32-NEXT: j .LBB61_858 +; CHECK-RV32-NEXT: .LBB61_357: # 
%else1338 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_358 +; CHECK-RV32-NEXT: j .LBB61_859 +; CHECK-RV32-NEXT: .LBB61_358: # %else1342 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_359 +; CHECK-RV32-NEXT: j .LBB61_860 +; CHECK-RV32-NEXT: .LBB61_359: # %else1346 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_360 +; CHECK-RV32-NEXT: j .LBB61_861 +; CHECK-RV32-NEXT: .LBB61_360: # %else1350 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_361 +; CHECK-RV32-NEXT: j .LBB61_862 +; CHECK-RV32-NEXT: .LBB61_361: # %else1354 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_362 +; CHECK-RV32-NEXT: j .LBB61_863 +; CHECK-RV32-NEXT: .LBB61_362: # %else1358 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_363 +; CHECK-RV32-NEXT: j .LBB61_864 +; CHECK-RV32-NEXT: .LBB61_363: # %else1362 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_364 +; CHECK-RV32-NEXT: j .LBB61_865 +; CHECK-RV32-NEXT: .LBB61_364: # %else1366 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_365 +; CHECK-RV32-NEXT: j .LBB61_866 +; CHECK-RV32-NEXT: .LBB61_365: # %else1370 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_366 +; CHECK-RV32-NEXT: j .LBB61_867 +; CHECK-RV32-NEXT: .LBB61_366: # %else1374 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_367 +; CHECK-RV32-NEXT: j .LBB61_868 +; CHECK-RV32-NEXT: .LBB61_367: # %else1378 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_368 +; CHECK-RV32-NEXT: j .LBB61_869 +; CHECK-RV32-NEXT: .LBB61_368: # %else1382 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_369 +; CHECK-RV32-NEXT: j .LBB61_870 +; CHECK-RV32-NEXT: .LBB61_369: # %else1386 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_370 +; CHECK-RV32-NEXT: j .LBB61_871 +; CHECK-RV32-NEXT: .LBB61_370: # %else1390 +; 
CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_372 +; CHECK-RV32-NEXT: .LBB61_371: # %cond.load1393 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 350 +; CHECK-RV32-NEXT: li a4, 349 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_372: # %else1394 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_374 +; CHECK-RV32-NEXT: # %bb.373: # %cond.load1397 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 351 +; CHECK-RV32-NEXT: li a4, 350 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_374: # %else1398 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_375 +; CHECK-RV32-NEXT: j .LBB61_872 +; CHECK-RV32-NEXT: .LBB61_375: # %else1402 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_376 +; CHECK-RV32-NEXT: j .LBB61_873 +; CHECK-RV32-NEXT: .LBB61_376: # %else1406 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_377 +; CHECK-RV32-NEXT: j .LBB61_874 +; CHECK-RV32-NEXT: .LBB61_377: # %else1410 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_378 +; CHECK-RV32-NEXT: j .LBB61_875 +; CHECK-RV32-NEXT: .LBB61_378: # %else1414 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_379 +; CHECK-RV32-NEXT: j .LBB61_876 +; CHECK-RV32-NEXT: .LBB61_379: # %else1418 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_380 +; CHECK-RV32-NEXT: j .LBB61_877 +; CHECK-RV32-NEXT: .LBB61_380: # 
%else1422 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_381 +; CHECK-RV32-NEXT: j .LBB61_878 +; CHECK-RV32-NEXT: .LBB61_381: # %else1426 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_382 +; CHECK-RV32-NEXT: j .LBB61_879 +; CHECK-RV32-NEXT: .LBB61_382: # %else1430 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_383 +; CHECK-RV32-NEXT: j .LBB61_880 +; CHECK-RV32-NEXT: .LBB61_383: # %else1434 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_384 +; CHECK-RV32-NEXT: j .LBB61_881 +; CHECK-RV32-NEXT: .LBB61_384: # %else1438 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_385 +; CHECK-RV32-NEXT: j .LBB61_882 +; CHECK-RV32-NEXT: .LBB61_385: # %else1442 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_386 +; CHECK-RV32-NEXT: j .LBB61_883 +; CHECK-RV32-NEXT: .LBB61_386: # %else1446 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_387 +; CHECK-RV32-NEXT: j .LBB61_884 +; CHECK-RV32-NEXT: .LBB61_387: # %else1450 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_388 +; CHECK-RV32-NEXT: j .LBB61_885 +; CHECK-RV32-NEXT: .LBB61_388: # %else1454 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_389 +; CHECK-RV32-NEXT: j .LBB61_886 +; CHECK-RV32-NEXT: .LBB61_389: # %else1458 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_390 +; CHECK-RV32-NEXT: j .LBB61_887 +; CHECK-RV32-NEXT: .LBB61_390: # %else1462 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_391 +; CHECK-RV32-NEXT: j .LBB61_888 +; CHECK-RV32-NEXT: .LBB61_391: # %else1466 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_392 +; CHECK-RV32-NEXT: j .LBB61_889 +; CHECK-RV32-NEXT: .LBB61_392: # %else1470 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_393 +; CHECK-RV32-NEXT: j .LBB61_890 +; CHECK-RV32-NEXT: .LBB61_393: # 
%else1474 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_394 +; CHECK-RV32-NEXT: j .LBB61_891 +; CHECK-RV32-NEXT: .LBB61_394: # %else1478 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_395 +; CHECK-RV32-NEXT: j .LBB61_892 +; CHECK-RV32-NEXT: .LBB61_395: # %else1482 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_396 +; CHECK-RV32-NEXT: j .LBB61_893 +; CHECK-RV32-NEXT: .LBB61_396: # %else1486 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_397 +; CHECK-RV32-NEXT: j .LBB61_894 +; CHECK-RV32-NEXT: .LBB61_397: # %else1490 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_398 +; CHECK-RV32-NEXT: j .LBB61_895 +; CHECK-RV32-NEXT: .LBB61_398: # %else1494 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_399 +; CHECK-RV32-NEXT: j .LBB61_896 +; CHECK-RV32-NEXT: .LBB61_399: # %else1498 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_400 +; CHECK-RV32-NEXT: j .LBB61_897 +; CHECK-RV32-NEXT: .LBB61_400: # %else1502 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_401 +; CHECK-RV32-NEXT: j .LBB61_898 +; CHECK-RV32-NEXT: .LBB61_401: # %else1506 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_402 +; CHECK-RV32-NEXT: j .LBB61_899 +; CHECK-RV32-NEXT: .LBB61_402: # %else1510 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_403 +; CHECK-RV32-NEXT: j .LBB61_900 +; CHECK-RV32-NEXT: .LBB61_403: # %else1514 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_404 +; CHECK-RV32-NEXT: j .LBB61_901 +; CHECK-RV32-NEXT: .LBB61_404: # %else1518 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_406 +; CHECK-RV32-NEXT: .LBB61_405: # %cond.load1521 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 382 +; 
CHECK-RV32-NEXT: li a4, 381 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_406: # %else1522 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_408 +; CHECK-RV32-NEXT: # %bb.407: # %cond.load1525 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: li a3, 383 +; CHECK-RV32-NEXT: li a4, 382 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_408: # %else1526 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_409 +; CHECK-RV32-NEXT: j .LBB61_902 +; CHECK-RV32-NEXT: .LBB61_409: # %else1530 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_410 +; CHECK-RV32-NEXT: j .LBB61_903 +; CHECK-RV32-NEXT: .LBB61_410: # %else1534 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_411 +; CHECK-RV32-NEXT: j .LBB61_904 +; CHECK-RV32-NEXT: .LBB61_411: # %else1538 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_412 +; CHECK-RV32-NEXT: j .LBB61_905 +; CHECK-RV32-NEXT: .LBB61_412: # %else1542 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_413 +; CHECK-RV32-NEXT: j .LBB61_906 +; CHECK-RV32-NEXT: .LBB61_413: # %else1546 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_414 +; CHECK-RV32-NEXT: j .LBB61_907 +; CHECK-RV32-NEXT: .LBB61_414: # %else1550 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_415 +; CHECK-RV32-NEXT: j .LBB61_908 +; CHECK-RV32-NEXT: .LBB61_415: # %else1554 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_416 +; CHECK-RV32-NEXT: j .LBB61_909 +; CHECK-RV32-NEXT: .LBB61_416: # 
%else1558 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_417 +; CHECK-RV32-NEXT: j .LBB61_910 +; CHECK-RV32-NEXT: .LBB61_417: # %else1562 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_418 +; CHECK-RV32-NEXT: j .LBB61_911 +; CHECK-RV32-NEXT: .LBB61_418: # %else1566 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_419 +; CHECK-RV32-NEXT: j .LBB61_912 +; CHECK-RV32-NEXT: .LBB61_419: # %else1570 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_420 +; CHECK-RV32-NEXT: j .LBB61_913 +; CHECK-RV32-NEXT: .LBB61_420: # %else1574 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_421 +; CHECK-RV32-NEXT: j .LBB61_914 +; CHECK-RV32-NEXT: .LBB61_421: # %else1578 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_422 +; CHECK-RV32-NEXT: j .LBB61_915 +; CHECK-RV32-NEXT: .LBB61_422: # %else1582 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_423 +; CHECK-RV32-NEXT: j .LBB61_916 +; CHECK-RV32-NEXT: .LBB61_423: # %else1586 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_424 +; CHECK-RV32-NEXT: j .LBB61_917 +; CHECK-RV32-NEXT: .LBB61_424: # %else1590 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_425 +; CHECK-RV32-NEXT: j .LBB61_918 +; CHECK-RV32-NEXT: .LBB61_425: # %else1594 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_426 +; CHECK-RV32-NEXT: j .LBB61_919 +; CHECK-RV32-NEXT: .LBB61_426: # %else1598 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_427 +; CHECK-RV32-NEXT: j .LBB61_920 +; CHECK-RV32-NEXT: .LBB61_427: # %else1602 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_428 +; CHECK-RV32-NEXT: j .LBB61_921 +; CHECK-RV32-NEXT: .LBB61_428: # %else1606 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_429 +; CHECK-RV32-NEXT: j .LBB61_922 +; CHECK-RV32-NEXT: .LBB61_429: # 
%else1610 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_430 +; CHECK-RV32-NEXT: j .LBB61_923 +; CHECK-RV32-NEXT: .LBB61_430: # %else1614 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_431 +; CHECK-RV32-NEXT: j .LBB61_924 +; CHECK-RV32-NEXT: .LBB61_431: # %else1618 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_432 +; CHECK-RV32-NEXT: j .LBB61_925 +; CHECK-RV32-NEXT: .LBB61_432: # %else1622 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_433 +; CHECK-RV32-NEXT: j .LBB61_926 +; CHECK-RV32-NEXT: .LBB61_433: # %else1626 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_434 +; CHECK-RV32-NEXT: j .LBB61_927 +; CHECK-RV32-NEXT: .LBB61_434: # %else1630 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_435 +; CHECK-RV32-NEXT: j .LBB61_928 +; CHECK-RV32-NEXT: .LBB61_435: # %else1634 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_436 +; CHECK-RV32-NEXT: j .LBB61_929 +; CHECK-RV32-NEXT: .LBB61_436: # %else1638 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_437 +; CHECK-RV32-NEXT: j .LBB61_930 +; CHECK-RV32-NEXT: .LBB61_437: # %else1642 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_438 +; CHECK-RV32-NEXT: j .LBB61_931 +; CHECK-RV32-NEXT: .LBB61_438: # %else1646 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_440 +; CHECK-RV32-NEXT: .LBB61_439: # %cond.load1649 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 414 +; CHECK-RV32-NEXT: li a4, 413 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_440: # %else1650 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; 
CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_442 +; CHECK-RV32-NEXT: # %bb.441: # %cond.load1653 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 415 +; CHECK-RV32-NEXT: li a4, 414 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_442: # %else1654 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_443 +; CHECK-RV32-NEXT: j .LBB61_932 +; CHECK-RV32-NEXT: .LBB61_443: # %else1658 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: beqz a3, .LBB61_444 +; CHECK-RV32-NEXT: j .LBB61_933 +; CHECK-RV32-NEXT: .LBB61_444: # %else1662 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: beqz a3, .LBB61_445 +; CHECK-RV32-NEXT: j .LBB61_934 +; CHECK-RV32-NEXT: .LBB61_445: # %else1666 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: beqz a3, .LBB61_446 +; CHECK-RV32-NEXT: j .LBB61_935 +; CHECK-RV32-NEXT: .LBB61_446: # %else1670 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: beqz a3, .LBB61_447 +; CHECK-RV32-NEXT: j .LBB61_936 +; CHECK-RV32-NEXT: .LBB61_447: # %else1674 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: beqz a3, .LBB61_448 +; CHECK-RV32-NEXT: j .LBB61_937 +; CHECK-RV32-NEXT: .LBB61_448: # %else1678 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: beqz a3, .LBB61_449 +; CHECK-RV32-NEXT: j .LBB61_938 +; CHECK-RV32-NEXT: .LBB61_449: # %else1682 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: beqz a3, .LBB61_450 +; CHECK-RV32-NEXT: j .LBB61_939 +; CHECK-RV32-NEXT: .LBB61_450: # %else1686 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: beqz a3, .LBB61_451 +; CHECK-RV32-NEXT: j .LBB61_940 +; CHECK-RV32-NEXT: .LBB61_451: # %else1690 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: beqz a3, .LBB61_452 +; CHECK-RV32-NEXT: j .LBB61_941 +; CHECK-RV32-NEXT: 
.LBB61_452: # %else1694 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: beqz a3, .LBB61_453 +; CHECK-RV32-NEXT: j .LBB61_942 +; CHECK-RV32-NEXT: .LBB61_453: # %else1698 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: beqz a3, .LBB61_454 +; CHECK-RV32-NEXT: j .LBB61_943 +; CHECK-RV32-NEXT: .LBB61_454: # %else1702 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bgez a3, .LBB61_455 +; CHECK-RV32-NEXT: j .LBB61_944 +; CHECK-RV32-NEXT: .LBB61_455: # %else1706 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bgez a3, .LBB61_456 +; CHECK-RV32-NEXT: j .LBB61_945 +; CHECK-RV32-NEXT: .LBB61_456: # %else1710 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bgez a3, .LBB61_457 +; CHECK-RV32-NEXT: j .LBB61_946 +; CHECK-RV32-NEXT: .LBB61_457: # %else1714 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bgez a3, .LBB61_458 +; CHECK-RV32-NEXT: j .LBB61_947 +; CHECK-RV32-NEXT: .LBB61_458: # %else1718 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_459 +; CHECK-RV32-NEXT: j .LBB61_948 +; CHECK-RV32-NEXT: .LBB61_459: # %else1722 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bgez a3, .LBB61_460 +; CHECK-RV32-NEXT: j .LBB61_949 +; CHECK-RV32-NEXT: .LBB61_460: # %else1726 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bgez a3, .LBB61_461 +; CHECK-RV32-NEXT: j .LBB61_950 +; CHECK-RV32-NEXT: .LBB61_461: # %else1730 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bgez a3, .LBB61_462 +; CHECK-RV32-NEXT: j .LBB61_951 +; CHECK-RV32-NEXT: .LBB61_462: # %else1734 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bgez a3, .LBB61_463 +; CHECK-RV32-NEXT: j .LBB61_952 +; CHECK-RV32-NEXT: .LBB61_463: # %else1738 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bgez a3, .LBB61_464 +; CHECK-RV32-NEXT: j .LBB61_953 +; CHECK-RV32-NEXT: .LBB61_464: # %else1742 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bgez a3, .LBB61_465 +; CHECK-RV32-NEXT: j .LBB61_954 +; CHECK-RV32-NEXT: 
.LBB61_465: # %else1746 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bgez a3, .LBB61_466 +; CHECK-RV32-NEXT: j .LBB61_955 +; CHECK-RV32-NEXT: .LBB61_466: # %else1750 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bgez a3, .LBB61_467 +; CHECK-RV32-NEXT: j .LBB61_956 +; CHECK-RV32-NEXT: .LBB61_467: # %else1754 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_468 +; CHECK-RV32-NEXT: j .LBB61_957 +; CHECK-RV32-NEXT: .LBB61_468: # %else1758 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bgez a3, .LBB61_469 +; CHECK-RV32-NEXT: j .LBB61_958 +; CHECK-RV32-NEXT: .LBB61_469: # %else1762 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bgez a3, .LBB61_470 +; CHECK-RV32-NEXT: j .LBB61_959 +; CHECK-RV32-NEXT: .LBB61_470: # %else1766 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bgez a3, .LBB61_471 +; CHECK-RV32-NEXT: j .LBB61_960 +; CHECK-RV32-NEXT: .LBB61_471: # %else1770 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bgez a3, .LBB61_472 +; CHECK-RV32-NEXT: j .LBB61_961 +; CHECK-RV32-NEXT: .LBB61_472: # %else1774 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_474 +; CHECK-RV32-NEXT: .LBB61_473: # %cond.load1777 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 446 +; CHECK-RV32-NEXT: li a4, 445 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_474: # %else1778 +; CHECK-RV32-NEXT: slli a3, a2, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vslidedown.vi v16, v0, 7 +; CHECK-RV32-NEXT: bgez a3, .LBB61_476 +; CHECK-RV32-NEXT: # %bb.475: # %cond.load1781 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: li a3, 447 +; CHECK-RV32-NEXT: li a4, 446 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, 
m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_476: # %else1782 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a3, v16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_477 +; CHECK-RV32-NEXT: j .LBB61_962 +; CHECK-RV32-NEXT: .LBB61_477: # %else1786 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_478 +; CHECK-RV32-NEXT: j .LBB61_963 +; CHECK-RV32-NEXT: .LBB61_478: # %else1790 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_479 +; CHECK-RV32-NEXT: j .LBB61_964 +; CHECK-RV32-NEXT: .LBB61_479: # %else1794 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_480 +; CHECK-RV32-NEXT: j .LBB61_965 +; CHECK-RV32-NEXT: .LBB61_480: # %else1798 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_481 +; CHECK-RV32-NEXT: j .LBB61_966 +; CHECK-RV32-NEXT: .LBB61_481: # %else1802 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_482 +; CHECK-RV32-NEXT: j .LBB61_967 +; CHECK-RV32-NEXT: .LBB61_482: # %else1806 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_483 +; CHECK-RV32-NEXT: j .LBB61_968 +; CHECK-RV32-NEXT: .LBB61_483: # %else1810 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_484 +; CHECK-RV32-NEXT: j .LBB61_969 +; CHECK-RV32-NEXT: .LBB61_484: # %else1814 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_485 +; CHECK-RV32-NEXT: j .LBB61_970 +; CHECK-RV32-NEXT: .LBB61_485: # %else1818 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_486 +; CHECK-RV32-NEXT: j .LBB61_971 +; CHECK-RV32-NEXT: .LBB61_486: # %else1822 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_487 +; CHECK-RV32-NEXT: j .LBB61_972 +; CHECK-RV32-NEXT: .LBB61_487: # %else1826 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_488 +; CHECK-RV32-NEXT: j .LBB61_973 +; 
CHECK-RV32-NEXT: .LBB61_488: # %else1830 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_489 +; CHECK-RV32-NEXT: j .LBB61_974 +; CHECK-RV32-NEXT: .LBB61_489: # %else1834 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_490 +; CHECK-RV32-NEXT: j .LBB61_975 +; CHECK-RV32-NEXT: .LBB61_490: # %else1838 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_491 +; CHECK-RV32-NEXT: j .LBB61_976 +; CHECK-RV32-NEXT: .LBB61_491: # %else1842 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_492 +; CHECK-RV32-NEXT: j .LBB61_977 +; CHECK-RV32-NEXT: .LBB61_492: # %else1846 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_493 +; CHECK-RV32-NEXT: j .LBB61_978 +; CHECK-RV32-NEXT: .LBB61_493: # %else1850 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_494 +; CHECK-RV32-NEXT: j .LBB61_979 +; CHECK-RV32-NEXT: .LBB61_494: # %else1854 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_495 +; CHECK-RV32-NEXT: j .LBB61_980 +; CHECK-RV32-NEXT: .LBB61_495: # %else1858 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_496 +; CHECK-RV32-NEXT: j .LBB61_981 +; CHECK-RV32-NEXT: .LBB61_496: # %else1862 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_497 +; CHECK-RV32-NEXT: j .LBB61_982 +; CHECK-RV32-NEXT: .LBB61_497: # %else1866 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_498 +; CHECK-RV32-NEXT: j .LBB61_983 +; CHECK-RV32-NEXT: .LBB61_498: # %else1870 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_499 +; CHECK-RV32-NEXT: j .LBB61_984 +; CHECK-RV32-NEXT: .LBB61_499: # %else1874 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_500 +; CHECK-RV32-NEXT: j .LBB61_985 +; CHECK-RV32-NEXT: .LBB61_500: # %else1878 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_501 +; CHECK-RV32-NEXT: j .LBB61_986 +; 
CHECK-RV32-NEXT: .LBB61_501: # %else1882 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_502 +; CHECK-RV32-NEXT: j .LBB61_987 +; CHECK-RV32-NEXT: .LBB61_502: # %else1886 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_503 +; CHECK-RV32-NEXT: j .LBB61_988 +; CHECK-RV32-NEXT: .LBB61_503: # %else1890 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_504 +; CHECK-RV32-NEXT: j .LBB61_989 +; CHECK-RV32-NEXT: .LBB61_504: # %else1894 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_505 +; CHECK-RV32-NEXT: j .LBB61_990 +; CHECK-RV32-NEXT: .LBB61_505: # %else1898 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_506 +; CHECK-RV32-NEXT: j .LBB61_991 +; CHECK-RV32-NEXT: .LBB61_506: # %else1902 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_508 +; CHECK-RV32-NEXT: .LBB61_507: # %cond.load1905 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 478 +; CHECK-RV32-NEXT: li a4, 477 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_508: # %else1906 +; CHECK-RV32-NEXT: slli a2, a3, 1 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vsrl.vx v16, v16, a1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_510 +; CHECK-RV32-NEXT: # %bb.509: # %cond.load1909 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a1 +; CHECK-RV32-NEXT: li a1, 479 +; CHECK-RV32-NEXT: li a2, 478 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a2 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: .LBB61_510: # %else1910 +; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV32-NEXT: vmv.x.s a1, v16 +; CHECK-RV32-NEXT: bgez a3, .LBB61_511 +; CHECK-RV32-NEXT: j 
.LBB61_992 +; CHECK-RV32-NEXT: .LBB61_511: # %else1914 +; CHECK-RV32-NEXT: andi a2, a1, 1 +; CHECK-RV32-NEXT: beqz a2, .LBB61_512 +; CHECK-RV32-NEXT: j .LBB61_993 +; CHECK-RV32-NEXT: .LBB61_512: # %else1918 +; CHECK-RV32-NEXT: andi a2, a1, 2 +; CHECK-RV32-NEXT: beqz a2, .LBB61_513 +; CHECK-RV32-NEXT: j .LBB61_994 +; CHECK-RV32-NEXT: .LBB61_513: # %else1922 +; CHECK-RV32-NEXT: andi a2, a1, 4 +; CHECK-RV32-NEXT: beqz a2, .LBB61_514 +; CHECK-RV32-NEXT: j .LBB61_995 +; CHECK-RV32-NEXT: .LBB61_514: # %else1926 +; CHECK-RV32-NEXT: andi a2, a1, 8 +; CHECK-RV32-NEXT: beqz a2, .LBB61_515 +; CHECK-RV32-NEXT: j .LBB61_996 +; CHECK-RV32-NEXT: .LBB61_515: # %else1930 +; CHECK-RV32-NEXT: andi a2, a1, 16 +; CHECK-RV32-NEXT: beqz a2, .LBB61_516 +; CHECK-RV32-NEXT: j .LBB61_997 +; CHECK-RV32-NEXT: .LBB61_516: # %else1934 +; CHECK-RV32-NEXT: andi a2, a1, 32 +; CHECK-RV32-NEXT: beqz a2, .LBB61_517 +; CHECK-RV32-NEXT: j .LBB61_998 +; CHECK-RV32-NEXT: .LBB61_517: # %else1938 +; CHECK-RV32-NEXT: andi a2, a1, 64 +; CHECK-RV32-NEXT: beqz a2, .LBB61_518 +; CHECK-RV32-NEXT: j .LBB61_999 +; CHECK-RV32-NEXT: .LBB61_518: # %else1942 +; CHECK-RV32-NEXT: andi a2, a1, 128 +; CHECK-RV32-NEXT: beqz a2, .LBB61_519 +; CHECK-RV32-NEXT: j .LBB61_1000 +; CHECK-RV32-NEXT: .LBB61_519: # %else1946 +; CHECK-RV32-NEXT: andi a2, a1, 256 +; CHECK-RV32-NEXT: beqz a2, .LBB61_520 +; CHECK-RV32-NEXT: j .LBB61_1001 +; CHECK-RV32-NEXT: .LBB61_520: # %else1950 +; CHECK-RV32-NEXT: andi a2, a1, 512 +; CHECK-RV32-NEXT: beqz a2, .LBB61_521 +; CHECK-RV32-NEXT: j .LBB61_1002 +; CHECK-RV32-NEXT: .LBB61_521: # %else1954 +; CHECK-RV32-NEXT: andi a2, a1, 1024 +; CHECK-RV32-NEXT: beqz a2, .LBB61_522 +; CHECK-RV32-NEXT: j .LBB61_1003 +; CHECK-RV32-NEXT: .LBB61_522: # %else1958 +; CHECK-RV32-NEXT: slli a2, a1, 20 +; CHECK-RV32-NEXT: bgez a2, .LBB61_523 +; CHECK-RV32-NEXT: j .LBB61_1004 +; CHECK-RV32-NEXT: .LBB61_523: # %else1962 +; CHECK-RV32-NEXT: slli a2, a1, 19 +; CHECK-RV32-NEXT: bgez a2, .LBB61_524 +; CHECK-RV32-NEXT: j 
.LBB61_1005 +; CHECK-RV32-NEXT: .LBB61_524: # %else1966 +; CHECK-RV32-NEXT: slli a2, a1, 18 +; CHECK-RV32-NEXT: bgez a2, .LBB61_525 +; CHECK-RV32-NEXT: j .LBB61_1006 +; CHECK-RV32-NEXT: .LBB61_525: # %else1970 +; CHECK-RV32-NEXT: slli a2, a1, 17 +; CHECK-RV32-NEXT: bgez a2, .LBB61_526 +; CHECK-RV32-NEXT: j .LBB61_1007 +; CHECK-RV32-NEXT: .LBB61_526: # %else1974 +; CHECK-RV32-NEXT: slli a2, a1, 16 +; CHECK-RV32-NEXT: bgez a2, .LBB61_527 +; CHECK-RV32-NEXT: j .LBB61_1008 +; CHECK-RV32-NEXT: .LBB61_527: # %else1978 +; CHECK-RV32-NEXT: slli a2, a1, 15 +; CHECK-RV32-NEXT: bgez a2, .LBB61_528 +; CHECK-RV32-NEXT: j .LBB61_1009 +; CHECK-RV32-NEXT: .LBB61_528: # %else1982 +; CHECK-RV32-NEXT: slli a2, a1, 14 +; CHECK-RV32-NEXT: bgez a2, .LBB61_529 +; CHECK-RV32-NEXT: j .LBB61_1010 +; CHECK-RV32-NEXT: .LBB61_529: # %else1986 +; CHECK-RV32-NEXT: slli a2, a1, 13 +; CHECK-RV32-NEXT: bgez a2, .LBB61_530 +; CHECK-RV32-NEXT: j .LBB61_1011 +; CHECK-RV32-NEXT: .LBB61_530: # %else1990 +; CHECK-RV32-NEXT: slli a2, a1, 12 +; CHECK-RV32-NEXT: bgez a2, .LBB61_531 +; CHECK-RV32-NEXT: j .LBB61_1012 +; CHECK-RV32-NEXT: .LBB61_531: # %else1994 +; CHECK-RV32-NEXT: slli a2, a1, 11 +; CHECK-RV32-NEXT: bgez a2, .LBB61_532 +; CHECK-RV32-NEXT: j .LBB61_1013 +; CHECK-RV32-NEXT: .LBB61_532: # %else1998 +; CHECK-RV32-NEXT: slli a2, a1, 10 +; CHECK-RV32-NEXT: bgez a2, .LBB61_533 +; CHECK-RV32-NEXT: j .LBB61_1014 +; CHECK-RV32-NEXT: .LBB61_533: # %else2002 +; CHECK-RV32-NEXT: slli a2, a1, 9 +; CHECK-RV32-NEXT: bgez a2, .LBB61_534 +; CHECK-RV32-NEXT: j .LBB61_1015 +; CHECK-RV32-NEXT: .LBB61_534: # %else2006 +; CHECK-RV32-NEXT: slli a2, a1, 8 +; CHECK-RV32-NEXT: bgez a2, .LBB61_535 +; CHECK-RV32-NEXT: j .LBB61_1016 +; CHECK-RV32-NEXT: .LBB61_535: # %else2010 +; CHECK-RV32-NEXT: slli a2, a1, 7 +; CHECK-RV32-NEXT: bgez a2, .LBB61_536 +; CHECK-RV32-NEXT: j .LBB61_1017 +; CHECK-RV32-NEXT: .LBB61_536: # %else2014 +; CHECK-RV32-NEXT: slli a2, a1, 6 +; CHECK-RV32-NEXT: bgez a2, .LBB61_537 +; CHECK-RV32-NEXT: j 
.LBB61_1018 +; CHECK-RV32-NEXT: .LBB61_537: # %else2018 +; CHECK-RV32-NEXT: slli a2, a1, 5 +; CHECK-RV32-NEXT: bgez a2, .LBB61_538 +; CHECK-RV32-NEXT: j .LBB61_1019 +; CHECK-RV32-NEXT: .LBB61_538: # %else2022 +; CHECK-RV32-NEXT: slli a2, a1, 4 +; CHECK-RV32-NEXT: bgez a2, .LBB61_539 +; CHECK-RV32-NEXT: j .LBB61_1020 +; CHECK-RV32-NEXT: .LBB61_539: # %else2026 +; CHECK-RV32-NEXT: slli a2, a1, 3 +; CHECK-RV32-NEXT: bgez a2, .LBB61_540 +; CHECK-RV32-NEXT: j .LBB61_1021 +; CHECK-RV32-NEXT: .LBB61_540: # %else2030 +; CHECK-RV32-NEXT: slli a2, a1, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_541 +; CHECK-RV32-NEXT: j .LBB61_1022 +; CHECK-RV32-NEXT: .LBB61_541: # %else2034 +; CHECK-RV32-NEXT: slli a2, a1, 1 +; CHECK-RV32-NEXT: bgez a2, .LBB61_542 +; CHECK-RV32-NEXT: j .LBB61_1023 +; CHECK-RV32-NEXT: .LBB61_542: # %else2038 +; CHECK-RV32-NEXT: bgez a1, .LBB61_543 +; CHECK-RV32-NEXT: j .LBB61_1024 +; CHECK-RV32-NEXT: .LBB61_543: # %else2042 +; CHECK-RV32-NEXT: ret +; CHECK-RV32-NEXT: .LBB61_544: # %cond.load +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v8, a1 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a1, a3, 2 +; CHECK-RV32-NEXT: bnez a1, .LBB61_545 +; CHECK-RV32-NEXT: j .LBB61_2 +; CHECK-RV32-NEXT: .LBB61_545: # %cond.load1 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 1 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 4 +; CHECK-RV32-NEXT: bnez a1, .LBB61_546 +; CHECK-RV32-NEXT: j .LBB61_3 +; CHECK-RV32-NEXT: .LBB61_546: # %cond.load5 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: 
vsetivli zero, 3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 2 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 8 +; CHECK-RV32-NEXT: bnez a1, .LBB61_547 +; CHECK-RV32-NEXT: j .LBB61_4 +; CHECK-RV32-NEXT: .LBB61_547: # %cond.load9 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 16 +; CHECK-RV32-NEXT: bnez a1, .LBB61_548 +; CHECK-RV32-NEXT: j .LBB61_5 +; CHECK-RV32-NEXT: .LBB61_548: # %cond.load13 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 5, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 32 +; CHECK-RV32-NEXT: bnez a1, .LBB61_549 +; CHECK-RV32-NEXT: j .LBB61_6 +; CHECK-RV32-NEXT: .LBB61_549: # %cond.load17 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 6, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 5 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 64 +; CHECK-RV32-NEXT: bnez a1, .LBB61_550 +; CHECK-RV32-NEXT: j .LBB61_7 +; CHECK-RV32-NEXT: .LBB61_550: # %cond.load21 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 7, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 6 
+; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 128 +; CHECK-RV32-NEXT: bnez a1, .LBB61_551 +; CHECK-RV32-NEXT: j .LBB61_8 +; CHECK-RV32-NEXT: .LBB61_551: # %cond.load25 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 7 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 256 +; CHECK-RV32-NEXT: bnez a1, .LBB61_552 +; CHECK-RV32-NEXT: j .LBB61_9 +; CHECK-RV32-NEXT: .LBB61_552: # %cond.load29 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 9, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 8 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 512 +; CHECK-RV32-NEXT: bnez a1, .LBB61_553 +; CHECK-RV32-NEXT: j .LBB61_10 +; CHECK-RV32-NEXT: .LBB61_553: # %cond.load33 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 10, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 9 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a1, a3, 1024 +; CHECK-RV32-NEXT: bnez a1, .LBB61_554 +; CHECK-RV32-NEXT: j .LBB61_11 +; CHECK-RV32-NEXT: .LBB61_554: # %cond.load37 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 11, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 10 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, 
a3, 20 +; CHECK-RV32-NEXT: bltz a1, .LBB61_555 +; CHECK-RV32-NEXT: j .LBB61_12 +; CHECK-RV32-NEXT: .LBB61_555: # %cond.load41 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 12, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 11 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 19 +; CHECK-RV32-NEXT: bltz a1, .LBB61_556 +; CHECK-RV32-NEXT: j .LBB61_13 +; CHECK-RV32-NEXT: .LBB61_556: # %cond.load45 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 13, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 12 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 18 +; CHECK-RV32-NEXT: bltz a1, .LBB61_557 +; CHECK-RV32-NEXT: j .LBB61_14 +; CHECK-RV32-NEXT: .LBB61_557: # %cond.load49 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 14, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 13 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 17 +; CHECK-RV32-NEXT: bltz a1, .LBB61_558 +; CHECK-RV32-NEXT: j .LBB61_15 +; CHECK-RV32-NEXT: .LBB61_558: # %cond.load53 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 15, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 14 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 16 +; CHECK-RV32-NEXT: bltz a1, .LBB61_559 +; CHECK-RV32-NEXT: j .LBB61_16 +; CHECK-RV32-NEXT: .LBB61_559: # %cond.load57 +; 
CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 16, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 15 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 15 +; CHECK-RV32-NEXT: bltz a1, .LBB61_560 +; CHECK-RV32-NEXT: j .LBB61_17 +; CHECK-RV32-NEXT: .LBB61_560: # %cond.load61 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 17, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 16 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 14 +; CHECK-RV32-NEXT: bltz a1, .LBB61_561 +; CHECK-RV32-NEXT: j .LBB61_18 +; CHECK-RV32-NEXT: .LBB61_561: # %cond.load65 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 18, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 17 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 13 +; CHECK-RV32-NEXT: bltz a1, .LBB61_562 +; CHECK-RV32-NEXT: j .LBB61_19 +; CHECK-RV32-NEXT: .LBB61_562: # %cond.load69 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 19, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 18 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 12 +; CHECK-RV32-NEXT: bltz a1, .LBB61_563 +; CHECK-RV32-NEXT: j .LBB61_20 +; CHECK-RV32-NEXT: .LBB61_563: # %cond.load73 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 20, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 19 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 11 +; CHECK-RV32-NEXT: bltz a1, .LBB61_564 +; CHECK-RV32-NEXT: j .LBB61_21 +; CHECK-RV32-NEXT: .LBB61_564: # %cond.load77 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 21, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 20 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 10 +; CHECK-RV32-NEXT: bltz a1, .LBB61_565 +; CHECK-RV32-NEXT: j .LBB61_22 +; CHECK-RV32-NEXT: .LBB61_565: # %cond.load81 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 22, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 21 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 9 +; CHECK-RV32-NEXT: bltz a1, .LBB61_566 +; CHECK-RV32-NEXT: j .LBB61_23 +; CHECK-RV32-NEXT: .LBB61_566: # %cond.load85 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 23, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 22 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 8 +; CHECK-RV32-NEXT: bltz a1, .LBB61_567 +; CHECK-RV32-NEXT: j .LBB61_24 +; CHECK-RV32-NEXT: .LBB61_567: # %cond.load89 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 24, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 23 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: 
vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 7 +; CHECK-RV32-NEXT: bltz a1, .LBB61_568 +; CHECK-RV32-NEXT: j .LBB61_25 +; CHECK-RV32-NEXT: .LBB61_568: # %cond.load93 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 25, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 24 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 6 +; CHECK-RV32-NEXT: bltz a1, .LBB61_569 +; CHECK-RV32-NEXT: j .LBB61_26 +; CHECK-RV32-NEXT: .LBB61_569: # %cond.load97 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 26, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 25 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 5 +; CHECK-RV32-NEXT: bltz a1, .LBB61_570 +; CHECK-RV32-NEXT: j .LBB61_27 +; CHECK-RV32-NEXT: .LBB61_570: # %cond.load101 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 27, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 26 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 4 +; CHECK-RV32-NEXT: bltz a1, .LBB61_571 +; CHECK-RV32-NEXT: j .LBB61_28 +; CHECK-RV32-NEXT: .LBB61_571: # %cond.load105 +; CHECK-RV32-NEXT: lbu a1, 0(a0) +; CHECK-RV32-NEXT: vsetivli zero, 28, e8, m1, tu, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a1 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vslideup.vi v8, v16, 27 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a1, a3, 3 +; CHECK-RV32-NEXT: bgez a1, .LBB61_1025 +; 
CHECK-RV32-NEXT: j .LBB61_29 +; CHECK-RV32-NEXT: .LBB61_1025: # %cond.load105 +; CHECK-RV32-NEXT: j .LBB61_30 +; CHECK-RV32-NEXT: .LBB61_572: # %cond.load121 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 32 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vi v8, v24, 31 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_573 +; CHECK-RV32-NEXT: j .LBB61_36 +; CHECK-RV32-NEXT: .LBB61_573: # %cond.load125 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 33 +; CHECK-RV32-NEXT: li a4, 32 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_574 +; CHECK-RV32-NEXT: j .LBB61_37 +; CHECK-RV32-NEXT: .LBB61_574: # %cond.load129 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 34 +; CHECK-RV32-NEXT: li a4, 33 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_575 +; CHECK-RV32-NEXT: j .LBB61_38 +; CHECK-RV32-NEXT: .LBB61_575: # %cond.load133 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 35 +; CHECK-RV32-NEXT: li a4, 34 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_576 +; CHECK-RV32-NEXT: j .LBB61_39 +; CHECK-RV32-NEXT: .LBB61_576: # %cond.load137 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 36 +; CHECK-RV32-NEXT: li a4, 35 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_577 +; CHECK-RV32-NEXT: j .LBB61_40 +; CHECK-RV32-NEXT: .LBB61_577: # %cond.load141 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 37 +; CHECK-RV32-NEXT: li a4, 36 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_578 +; CHECK-RV32-NEXT: j .LBB61_41 +; CHECK-RV32-NEXT: .LBB61_578: # %cond.load145 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 38 +; CHECK-RV32-NEXT: li a4, 37 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, 
m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_579 +; CHECK-RV32-NEXT: j .LBB61_42 +; CHECK-RV32-NEXT: .LBB61_579: # %cond.load149 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 39 +; CHECK-RV32-NEXT: li a4, 38 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_580 +; CHECK-RV32-NEXT: j .LBB61_43 +; CHECK-RV32-NEXT: .LBB61_580: # %cond.load153 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 40 +; CHECK-RV32-NEXT: li a4, 39 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_581 +; CHECK-RV32-NEXT: j .LBB61_44 +; CHECK-RV32-NEXT: .LBB61_581: # %cond.load157 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 41 +; CHECK-RV32-NEXT: li a4, 40 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; 
CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_582 +; CHECK-RV32-NEXT: j .LBB61_45 +; CHECK-RV32-NEXT: .LBB61_582: # %cond.load161 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 42 +; CHECK-RV32-NEXT: li a4, 41 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_583 +; CHECK-RV32-NEXT: j .LBB61_46 +; CHECK-RV32-NEXT: .LBB61_583: # %cond.load165 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 43 +; CHECK-RV32-NEXT: li a4, 42 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_584 +; CHECK-RV32-NEXT: j .LBB61_47 +; CHECK-RV32-NEXT: .LBB61_584: # %cond.load169 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 44 +; CHECK-RV32-NEXT: li a4, 43 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_585 +; CHECK-RV32-NEXT: j .LBB61_48 +; CHECK-RV32-NEXT: .LBB61_585: # %cond.load173 +; 
CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 45 +; CHECK-RV32-NEXT: li a4, 44 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_586 +; CHECK-RV32-NEXT: j .LBB61_49 +; CHECK-RV32-NEXT: .LBB61_586: # %cond.load177 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 46 +; CHECK-RV32-NEXT: li a4, 45 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_587 +; CHECK-RV32-NEXT: j .LBB61_50 +; CHECK-RV32-NEXT: .LBB61_587: # %cond.load181 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 47 +; CHECK-RV32-NEXT: li a4, 46 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_588 +; CHECK-RV32-NEXT: j .LBB61_51 +; CHECK-RV32-NEXT: .LBB61_588: # %cond.load185 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; 
CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 48 +; CHECK-RV32-NEXT: li a4, 47 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_589 +; CHECK-RV32-NEXT: j .LBB61_52 +; CHECK-RV32-NEXT: .LBB61_589: # %cond.load189 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 49 +; CHECK-RV32-NEXT: li a4, 48 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_590 +; CHECK-RV32-NEXT: j .LBB61_53 +; CHECK-RV32-NEXT: .LBB61_590: # %cond.load193 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 50 +; CHECK-RV32-NEXT: li a4, 49 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_591 +; CHECK-RV32-NEXT: j .LBB61_54 +; CHECK-RV32-NEXT: .LBB61_591: # %cond.load197 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 51 +; CHECK-RV32-NEXT: li a4, 50 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_592 +; CHECK-RV32-NEXT: j .LBB61_55 +; CHECK-RV32-NEXT: .LBB61_592: # %cond.load201 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 52 +; CHECK-RV32-NEXT: li a4, 51 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_593 +; CHECK-RV32-NEXT: j .LBB61_56 +; CHECK-RV32-NEXT: .LBB61_593: # %cond.load205 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 53 +; CHECK-RV32-NEXT: li a4, 52 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_594 +; CHECK-RV32-NEXT: j .LBB61_57 +; CHECK-RV32-NEXT: .LBB61_594: # %cond.load209 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 54 +; CHECK-RV32-NEXT: li a4, 53 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; 
CHECK-RV32-NEXT: bltz a3, .LBB61_595 +; CHECK-RV32-NEXT: j .LBB61_58 +; CHECK-RV32-NEXT: .LBB61_595: # %cond.load213 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 55 +; CHECK-RV32-NEXT: li a4, 54 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_596 +; CHECK-RV32-NEXT: j .LBB61_59 +; CHECK-RV32-NEXT: .LBB61_596: # %cond.load217 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 56 +; CHECK-RV32-NEXT: li a4, 55 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_597 +; CHECK-RV32-NEXT: j .LBB61_60 +; CHECK-RV32-NEXT: .LBB61_597: # %cond.load221 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 57 +; CHECK-RV32-NEXT: li a4, 56 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_598 +; CHECK-RV32-NEXT: j .LBB61_61 +; CHECK-RV32-NEXT: .LBB61_598: # %cond.load225 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: 
li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 58 +; CHECK-RV32-NEXT: li a4, 57 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_599 +; CHECK-RV32-NEXT: j .LBB61_62 +; CHECK-RV32-NEXT: .LBB61_599: # %cond.load229 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 59 +; CHECK-RV32-NEXT: li a4, 58 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_600 +; CHECK-RV32-NEXT: j .LBB61_63 +; CHECK-RV32-NEXT: .LBB61_600: # %cond.load233 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 60 +; CHECK-RV32-NEXT: li a4, 59 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_601 +; CHECK-RV32-NEXT: j .LBB61_64 +; CHECK-RV32-NEXT: .LBB61_601: # %cond.load237 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 61 +; 
CHECK-RV32-NEXT: li a4, 60 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1026 +; CHECK-RV32-NEXT: j .LBB61_65 +; CHECK-RV32-NEXT: .LBB61_1026: # %cond.load237 +; CHECK-RV32-NEXT: j .LBB61_66 +; CHECK-RV32-NEXT: .LBB61_602: # %cond.load249 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v17, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 64 +; CHECK-RV32-NEXT: li a4, 63 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v17, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv1r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_603 +; CHECK-RV32-NEXT: j .LBB61_70 +; CHECK-RV32-NEXT: .LBB61_603: # %cond.load253 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 65 +; CHECK-RV32-NEXT: li a4, 64 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_604 +; CHECK-RV32-NEXT: j .LBB61_71 +; CHECK-RV32-NEXT: .LBB61_604: # %cond.load257 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 66 +; CHECK-RV32-NEXT: li a4, 65 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_605 +; CHECK-RV32-NEXT: j .LBB61_72 +; CHECK-RV32-NEXT: .LBB61_605: # %cond.load261 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 67 +; CHECK-RV32-NEXT: li a4, 66 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_606 +; CHECK-RV32-NEXT: j .LBB61_73 +; CHECK-RV32-NEXT: .LBB61_606: # %cond.load265 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 68 +; CHECK-RV32-NEXT: li a4, 67 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_607 +; CHECK-RV32-NEXT: j .LBB61_74 +; CHECK-RV32-NEXT: .LBB61_607: # %cond.load269 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 69 +; CHECK-RV32-NEXT: li a4, 68 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_608 +; CHECK-RV32-NEXT: j 
.LBB61_75 +; CHECK-RV32-NEXT: .LBB61_608: # %cond.load273 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 70 +; CHECK-RV32-NEXT: li a4, 69 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_609 +; CHECK-RV32-NEXT: j .LBB61_76 +; CHECK-RV32-NEXT: .LBB61_609: # %cond.load277 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 71 +; CHECK-RV32-NEXT: li a4, 70 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_610 +; CHECK-RV32-NEXT: j .LBB61_77 +; CHECK-RV32-NEXT: .LBB61_610: # %cond.load281 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 72 +; CHECK-RV32-NEXT: li a4, 71 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_611 +; CHECK-RV32-NEXT: j .LBB61_78 +; CHECK-RV32-NEXT: .LBB61_611: # %cond.load285 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, 
m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 73 +; CHECK-RV32-NEXT: li a4, 72 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_612 +; CHECK-RV32-NEXT: j .LBB61_79 +; CHECK-RV32-NEXT: .LBB61_612: # %cond.load289 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 74 +; CHECK-RV32-NEXT: li a4, 73 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_613 +; CHECK-RV32-NEXT: j .LBB61_80 +; CHECK-RV32-NEXT: .LBB61_613: # %cond.load293 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 75 +; CHECK-RV32-NEXT: li a4, 74 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_614 +; CHECK-RV32-NEXT: j .LBB61_81 +; CHECK-RV32-NEXT: .LBB61_614: # %cond.load297 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 76 +; CHECK-RV32-NEXT: li a4, 75 +; CHECK-RV32-NEXT: 
vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_615 +; CHECK-RV32-NEXT: j .LBB61_82 +; CHECK-RV32-NEXT: .LBB61_615: # %cond.load301 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 77 +; CHECK-RV32-NEXT: li a4, 76 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_616 +; CHECK-RV32-NEXT: j .LBB61_83 +; CHECK-RV32-NEXT: .LBB61_616: # %cond.load305 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 78 +; CHECK-RV32-NEXT: li a4, 77 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_617 +; CHECK-RV32-NEXT: j .LBB61_84 +; CHECK-RV32-NEXT: .LBB61_617: # %cond.load309 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 79 +; CHECK-RV32-NEXT: li a4, 78 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_618 +; CHECK-RV32-NEXT: j .LBB61_85 +; CHECK-RV32-NEXT: .LBB61_618: # %cond.load313 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 80 +; CHECK-RV32-NEXT: li a4, 79 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_619 +; CHECK-RV32-NEXT: j .LBB61_86 +; CHECK-RV32-NEXT: .LBB61_619: # %cond.load317 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 81 +; CHECK-RV32-NEXT: li a4, 80 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_620 +; CHECK-RV32-NEXT: j .LBB61_87 +; CHECK-RV32-NEXT: .LBB61_620: # %cond.load321 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 82 +; CHECK-RV32-NEXT: li a4, 81 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_621 +; CHECK-RV32-NEXT: j .LBB61_88 +; CHECK-RV32-NEXT: .LBB61_621: # 
%cond.load325 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 83 +; CHECK-RV32-NEXT: li a4, 82 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_622 +; CHECK-RV32-NEXT: j .LBB61_89 +; CHECK-RV32-NEXT: .LBB61_622: # %cond.load329 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 84 +; CHECK-RV32-NEXT: li a4, 83 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_623 +; CHECK-RV32-NEXT: j .LBB61_90 +; CHECK-RV32-NEXT: .LBB61_623: # %cond.load333 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 85 +; CHECK-RV32-NEXT: li a4, 84 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_624 +; CHECK-RV32-NEXT: j .LBB61_91 +; CHECK-RV32-NEXT: .LBB61_624: # %cond.load337 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 
+; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 86 +; CHECK-RV32-NEXT: li a4, 85 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_625 +; CHECK-RV32-NEXT: j .LBB61_92 +; CHECK-RV32-NEXT: .LBB61_625: # %cond.load341 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 87 +; CHECK-RV32-NEXT: li a4, 86 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_626 +; CHECK-RV32-NEXT: j .LBB61_93 +; CHECK-RV32-NEXT: .LBB61_626: # %cond.load345 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 88 +; CHECK-RV32-NEXT: li a4, 87 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_627 +; CHECK-RV32-NEXT: j .LBB61_94 +; CHECK-RV32-NEXT: .LBB61_627: # %cond.load349 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 89 +; CHECK-RV32-NEXT: li a4, 88 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_628 +; CHECK-RV32-NEXT: j .LBB61_95 +; CHECK-RV32-NEXT: .LBB61_628: # %cond.load353 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 90 +; CHECK-RV32-NEXT: li a4, 89 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_629 +; CHECK-RV32-NEXT: j .LBB61_96 +; CHECK-RV32-NEXT: .LBB61_629: # %cond.load357 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 91 +; CHECK-RV32-NEXT: li a4, 90 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_630 +; CHECK-RV32-NEXT: j .LBB61_97 +; CHECK-RV32-NEXT: .LBB61_630: # %cond.load361 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 92 +; CHECK-RV32-NEXT: li a4, 91 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; 
CHECK-RV32-NEXT: bltz a2, .LBB61_631 +; CHECK-RV32-NEXT: j .LBB61_98 +; CHECK-RV32-NEXT: .LBB61_631: # %cond.load365 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 93 +; CHECK-RV32-NEXT: li a4, 92 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1027 +; CHECK-RV32-NEXT: j .LBB61_99 +; CHECK-RV32-NEXT: .LBB61_1027: # %cond.load365 +; CHECK-RV32-NEXT: j .LBB61_100 +; CHECK-RV32-NEXT: .LBB61_632: # %cond.load377 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 96 +; CHECK-RV32-NEXT: li a4, 95 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_633 +; CHECK-RV32-NEXT: j .LBB61_104 +; CHECK-RV32-NEXT: .LBB61_633: # %cond.load381 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 97 +; CHECK-RV32-NEXT: li a4, 96 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_634 +; CHECK-RV32-NEXT: j .LBB61_105 +; CHECK-RV32-NEXT: .LBB61_634: # %cond.load385 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: 
li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 98 +; CHECK-RV32-NEXT: li a4, 97 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_635 +; CHECK-RV32-NEXT: j .LBB61_106 +; CHECK-RV32-NEXT: .LBB61_635: # %cond.load389 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 99 +; CHECK-RV32-NEXT: li a4, 98 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_636 +; CHECK-RV32-NEXT: j .LBB61_107 +; CHECK-RV32-NEXT: .LBB61_636: # %cond.load393 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 100 +; CHECK-RV32-NEXT: li a4, 99 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_637 +; CHECK-RV32-NEXT: j .LBB61_108 +; CHECK-RV32-NEXT: .LBB61_637: # %cond.load397 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 101 
+; CHECK-RV32-NEXT: li a4, 100 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_638 +; CHECK-RV32-NEXT: j .LBB61_109 +; CHECK-RV32-NEXT: .LBB61_638: # %cond.load401 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 102 +; CHECK-RV32-NEXT: li a4, 101 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_639 +; CHECK-RV32-NEXT: j .LBB61_110 +; CHECK-RV32-NEXT: .LBB61_639: # %cond.load405 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 103 +; CHECK-RV32-NEXT: li a4, 102 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_640 +; CHECK-RV32-NEXT: j .LBB61_111 +; CHECK-RV32-NEXT: .LBB61_640: # %cond.load409 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 104 +; CHECK-RV32-NEXT: li a4, 103 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, 
a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_641 +; CHECK-RV32-NEXT: j .LBB61_112 +; CHECK-RV32-NEXT: .LBB61_641: # %cond.load413 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 105 +; CHECK-RV32-NEXT: li a4, 104 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_642 +; CHECK-RV32-NEXT: j .LBB61_113 +; CHECK-RV32-NEXT: .LBB61_642: # %cond.load417 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 106 +; CHECK-RV32-NEXT: li a4, 105 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_643 +; CHECK-RV32-NEXT: j .LBB61_114 +; CHECK-RV32-NEXT: .LBB61_643: # %cond.load421 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 107 +; CHECK-RV32-NEXT: li a4, 106 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_644 +; 
CHECK-RV32-NEXT: j .LBB61_115 +; CHECK-RV32-NEXT: .LBB61_644: # %cond.load425 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 108 +; CHECK-RV32-NEXT: li a4, 107 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_645 +; CHECK-RV32-NEXT: j .LBB61_116 +; CHECK-RV32-NEXT: .LBB61_645: # %cond.load429 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 109 +; CHECK-RV32-NEXT: li a4, 108 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_646 +; CHECK-RV32-NEXT: j .LBB61_117 +; CHECK-RV32-NEXT: .LBB61_646: # %cond.load433 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 110 +; CHECK-RV32-NEXT: li a4, 109 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_647 +; CHECK-RV32-NEXT: j .LBB61_118 +; CHECK-RV32-NEXT: .LBB61_647: # %cond.load437 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; 
CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 111 +; CHECK-RV32-NEXT: li a4, 110 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_648 +; CHECK-RV32-NEXT: j .LBB61_119 +; CHECK-RV32-NEXT: .LBB61_648: # %cond.load441 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 112 +; CHECK-RV32-NEXT: li a4, 111 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_649 +; CHECK-RV32-NEXT: j .LBB61_120 +; CHECK-RV32-NEXT: .LBB61_649: # %cond.load445 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 113 +; CHECK-RV32-NEXT: li a4, 112 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_650 +; CHECK-RV32-NEXT: j .LBB61_121 +; CHECK-RV32-NEXT: .LBB61_650: # %cond.load449 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 114 +; 
CHECK-RV32-NEXT: li a4, 113 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_651 +; CHECK-RV32-NEXT: j .LBB61_122 +; CHECK-RV32-NEXT: .LBB61_651: # %cond.load453 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 115 +; CHECK-RV32-NEXT: li a4, 114 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_652 +; CHECK-RV32-NEXT: j .LBB61_123 +; CHECK-RV32-NEXT: .LBB61_652: # %cond.load457 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 116 +; CHECK-RV32-NEXT: li a4, 115 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_653 +; CHECK-RV32-NEXT: j .LBB61_124 +; CHECK-RV32-NEXT: .LBB61_653: # %cond.load461 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 117 +; CHECK-RV32-NEXT: li a4, 116 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 
+; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_654 +; CHECK-RV32-NEXT: j .LBB61_125 +; CHECK-RV32-NEXT: .LBB61_654: # %cond.load465 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 118 +; CHECK-RV32-NEXT: li a4, 117 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_655 +; CHECK-RV32-NEXT: j .LBB61_126 +; CHECK-RV32-NEXT: .LBB61_655: # %cond.load469 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 119 +; CHECK-RV32-NEXT: li a4, 118 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_656 +; CHECK-RV32-NEXT: j .LBB61_127 +; CHECK-RV32-NEXT: .LBB61_656: # %cond.load473 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 120 +; CHECK-RV32-NEXT: li a4, 119 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_657 +; 
CHECK-RV32-NEXT: j .LBB61_128 +; CHECK-RV32-NEXT: .LBB61_657: # %cond.load477 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 121 +; CHECK-RV32-NEXT: li a4, 120 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_658 +; CHECK-RV32-NEXT: j .LBB61_129 +; CHECK-RV32-NEXT: .LBB61_658: # %cond.load481 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 122 +; CHECK-RV32-NEXT: li a4, 121 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_659 +; CHECK-RV32-NEXT: j .LBB61_130 +; CHECK-RV32-NEXT: .LBB61_659: # %cond.load485 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 123 +; CHECK-RV32-NEXT: li a4, 122 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_660 +; CHECK-RV32-NEXT: j .LBB61_131 +; CHECK-RV32-NEXT: .LBB61_660: # %cond.load489 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: 
vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 124 +; CHECK-RV32-NEXT: li a4, 123 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_661 +; CHECK-RV32-NEXT: j .LBB61_132 +; CHECK-RV32-NEXT: .LBB61_661: # %cond.load493 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a3 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 +; CHECK-RV32-NEXT: li a3, 125 +; CHECK-RV32-NEXT: li a4, 124 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1028 +; CHECK-RV32-NEXT: j .LBB61_133 +; CHECK-RV32-NEXT: .LBB61_1028: # %cond.load493 +; CHECK-RV32-NEXT: j .LBB61_134 +; CHECK-RV32-NEXT: .LBB61_662: # %cond.load505 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v18, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 128 +; CHECK-RV32-NEXT: li a4, 127 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v18, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv2r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_663 +; CHECK-RV32-NEXT: j .LBB61_138 +; CHECK-RV32-NEXT: .LBB61_663: # %cond.load509 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 129 +; CHECK-RV32-NEXT: li a4, 
128 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_664 +; CHECK-RV32-NEXT: j .LBB61_139 +; CHECK-RV32-NEXT: .LBB61_664: # %cond.load513 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 130 +; CHECK-RV32-NEXT: li a4, 129 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_665 +; CHECK-RV32-NEXT: j .LBB61_140 +; CHECK-RV32-NEXT: .LBB61_665: # %cond.load517 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 131 +; CHECK-RV32-NEXT: li a4, 130 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_666 +; CHECK-RV32-NEXT: j .LBB61_141 +; CHECK-RV32-NEXT: .LBB61_666: # %cond.load521 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 132 +; CHECK-RV32-NEXT: li a4, 131 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v 
v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_667 +; CHECK-RV32-NEXT: j .LBB61_142 +; CHECK-RV32-NEXT: .LBB61_667: # %cond.load525 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 133 +; CHECK-RV32-NEXT: li a4, 132 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_668 +; CHECK-RV32-NEXT: j .LBB61_143 +; CHECK-RV32-NEXT: .LBB61_668: # %cond.load529 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 134 +; CHECK-RV32-NEXT: li a4, 133 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_669 +; CHECK-RV32-NEXT: j .LBB61_144 +; CHECK-RV32-NEXT: .LBB61_669: # %cond.load533 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 135 +; CHECK-RV32-NEXT: li a4, 134 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_670 +; CHECK-RV32-NEXT: j .LBB61_145 +; 
CHECK-RV32-NEXT: .LBB61_670: # %cond.load537 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 136 +; CHECK-RV32-NEXT: li a4, 135 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_671 +; CHECK-RV32-NEXT: j .LBB61_146 +; CHECK-RV32-NEXT: .LBB61_671: # %cond.load541 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 137 +; CHECK-RV32-NEXT: li a4, 136 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_672 +; CHECK-RV32-NEXT: j .LBB61_147 +; CHECK-RV32-NEXT: .LBB61_672: # %cond.load545 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 138 +; CHECK-RV32-NEXT: li a4, 137 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_673 +; CHECK-RV32-NEXT: j .LBB61_148 +; CHECK-RV32-NEXT: .LBB61_673: # %cond.load549 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, 
ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 139 +; CHECK-RV32-NEXT: li a4, 138 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_674 +; CHECK-RV32-NEXT: j .LBB61_149 +; CHECK-RV32-NEXT: .LBB61_674: # %cond.load553 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 140 +; CHECK-RV32-NEXT: li a4, 139 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_675 +; CHECK-RV32-NEXT: j .LBB61_150 +; CHECK-RV32-NEXT: .LBB61_675: # %cond.load557 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 141 +; CHECK-RV32-NEXT: li a4, 140 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_676 +; CHECK-RV32-NEXT: j .LBB61_151 +; CHECK-RV32-NEXT: .LBB61_676: # %cond.load561 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 142 +; CHECK-RV32-NEXT: li a4, 141 +; CHECK-RV32-NEXT: 
vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_677 +; CHECK-RV32-NEXT: j .LBB61_152 +; CHECK-RV32-NEXT: .LBB61_677: # %cond.load565 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 143 +; CHECK-RV32-NEXT: li a4, 142 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_678 +; CHECK-RV32-NEXT: j .LBB61_153 +; CHECK-RV32-NEXT: .LBB61_678: # %cond.load569 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 144 +; CHECK-RV32-NEXT: li a4, 143 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_679 +; CHECK-RV32-NEXT: j .LBB61_154 +; CHECK-RV32-NEXT: .LBB61_679: # %cond.load573 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 145 +; CHECK-RV32-NEXT: li a4, 144 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_680 +; CHECK-RV32-NEXT: j .LBB61_155 +; CHECK-RV32-NEXT: .LBB61_680: # %cond.load577 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 146 +; CHECK-RV32-NEXT: li a4, 145 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_681 +; CHECK-RV32-NEXT: j .LBB61_156 +; CHECK-RV32-NEXT: .LBB61_681: # %cond.load581 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 147 +; CHECK-RV32-NEXT: li a4, 146 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_682 +; CHECK-RV32-NEXT: j .LBB61_157 +; CHECK-RV32-NEXT: .LBB61_682: # %cond.load585 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 148 +; CHECK-RV32-NEXT: li a4, 147 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_683 +; CHECK-RV32-NEXT: j .LBB61_158 +; 
CHECK-RV32-NEXT: .LBB61_683: # %cond.load589 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 149 +; CHECK-RV32-NEXT: li a4, 148 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_684 +; CHECK-RV32-NEXT: j .LBB61_159 +; CHECK-RV32-NEXT: .LBB61_684: # %cond.load593 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 150 +; CHECK-RV32-NEXT: li a4, 149 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_685 +; CHECK-RV32-NEXT: j .LBB61_160 +; CHECK-RV32-NEXT: .LBB61_685: # %cond.load597 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 151 +; CHECK-RV32-NEXT: li a4, 150 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_686 +; CHECK-RV32-NEXT: j .LBB61_161 +; CHECK-RV32-NEXT: .LBB61_686: # %cond.load601 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma 
+; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 152 +; CHECK-RV32-NEXT: li a4, 151 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_687 +; CHECK-RV32-NEXT: j .LBB61_162 +; CHECK-RV32-NEXT: .LBB61_687: # %cond.load605 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 153 +; CHECK-RV32-NEXT: li a4, 152 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_688 +; CHECK-RV32-NEXT: j .LBB61_163 +; CHECK-RV32-NEXT: .LBB61_688: # %cond.load609 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 154 +; CHECK-RV32-NEXT: li a4, 153 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_689 +; CHECK-RV32-NEXT: j .LBB61_164 +; CHECK-RV32-NEXT: .LBB61_689: # %cond.load613 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 155 +; CHECK-RV32-NEXT: li a4, 154 +; CHECK-RV32-NEXT: vsetvli 
zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_690 +; CHECK-RV32-NEXT: j .LBB61_165 +; CHECK-RV32-NEXT: .LBB61_690: # %cond.load617 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 156 +; CHECK-RV32-NEXT: li a4, 155 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_691 +; CHECK-RV32-NEXT: j .LBB61_166 +; CHECK-RV32-NEXT: .LBB61_691: # %cond.load621 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 157 +; CHECK-RV32-NEXT: li a4, 156 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1029 +; CHECK-RV32-NEXT: j .LBB61_167 +; CHECK-RV32-NEXT: .LBB61_1029: # %cond.load621 +; CHECK-RV32-NEXT: j .LBB61_168 +; CHECK-RV32-NEXT: .LBB61_692: # %cond.load633 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 160 +; CHECK-RV32-NEXT: li a4, 159 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: 
vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_693 +; CHECK-RV32-NEXT: j .LBB61_172 +; CHECK-RV32-NEXT: .LBB61_693: # %cond.load637 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 161 +; CHECK-RV32-NEXT: li a4, 160 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_694 +; CHECK-RV32-NEXT: j .LBB61_173 +; CHECK-RV32-NEXT: .LBB61_694: # %cond.load641 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 162 +; CHECK-RV32-NEXT: li a4, 161 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_695 +; CHECK-RV32-NEXT: j .LBB61_174 +; CHECK-RV32-NEXT: .LBB61_695: # %cond.load645 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 163 +; CHECK-RV32-NEXT: li a4, 162 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_696 +; CHECK-RV32-NEXT: j .LBB61_175 +; CHECK-RV32-NEXT: .LBB61_696: # 
%cond.load649 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 164 +; CHECK-RV32-NEXT: li a4, 163 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_697 +; CHECK-RV32-NEXT: j .LBB61_176 +; CHECK-RV32-NEXT: .LBB61_697: # %cond.load653 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 165 +; CHECK-RV32-NEXT: li a4, 164 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_698 +; CHECK-RV32-NEXT: j .LBB61_177 +; CHECK-RV32-NEXT: .LBB61_698: # %cond.load657 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 166 +; CHECK-RV32-NEXT: li a4, 165 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_699 +; CHECK-RV32-NEXT: j .LBB61_178 +; CHECK-RV32-NEXT: .LBB61_699: # %cond.load661 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x 
v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 167 +; CHECK-RV32-NEXT: li a4, 166 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_700 +; CHECK-RV32-NEXT: j .LBB61_179 +; CHECK-RV32-NEXT: .LBB61_700: # %cond.load665 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 168 +; CHECK-RV32-NEXT: li a4, 167 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_701 +; CHECK-RV32-NEXT: j .LBB61_180 +; CHECK-RV32-NEXT: .LBB61_701: # %cond.load669 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 169 +; CHECK-RV32-NEXT: li a4, 168 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_702 +; CHECK-RV32-NEXT: j .LBB61_181 +; CHECK-RV32-NEXT: .LBB61_702: # %cond.load673 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 170 +; CHECK-RV32-NEXT: li a4, 169 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, 
ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_703 +; CHECK-RV32-NEXT: j .LBB61_182 +; CHECK-RV32-NEXT: .LBB61_703: # %cond.load677 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 171 +; CHECK-RV32-NEXT: li a4, 170 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_704 +; CHECK-RV32-NEXT: j .LBB61_183 +; CHECK-RV32-NEXT: .LBB61_704: # %cond.load681 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 172 +; CHECK-RV32-NEXT: li a4, 171 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_705 +; CHECK-RV32-NEXT: j .LBB61_184 +; CHECK-RV32-NEXT: .LBB61_705: # %cond.load685 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 173 +; CHECK-RV32-NEXT: li a4, 172 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_706 +; CHECK-RV32-NEXT: j .LBB61_185 +; CHECK-RV32-NEXT: .LBB61_706: # %cond.load689 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 174 +; CHECK-RV32-NEXT: li a4, 173 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_707 +; CHECK-RV32-NEXT: j .LBB61_186 +; CHECK-RV32-NEXT: .LBB61_707: # %cond.load693 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 175 +; CHECK-RV32-NEXT: li a4, 174 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_708 +; CHECK-RV32-NEXT: j .LBB61_187 +; CHECK-RV32-NEXT: .LBB61_708: # %cond.load697 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 176 +; CHECK-RV32-NEXT: li a4, 175 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_709 +; CHECK-RV32-NEXT: j .LBB61_188 +; CHECK-RV32-NEXT: .LBB61_709: # %cond.load701 +; 
CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 177 +; CHECK-RV32-NEXT: li a4, 176 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_710 +; CHECK-RV32-NEXT: j .LBB61_189 +; CHECK-RV32-NEXT: .LBB61_710: # %cond.load705 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 178 +; CHECK-RV32-NEXT: li a4, 177 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_711 +; CHECK-RV32-NEXT: j .LBB61_190 +; CHECK-RV32-NEXT: .LBB61_711: # %cond.load709 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 179 +; CHECK-RV32-NEXT: li a4, 178 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_712 +; CHECK-RV32-NEXT: j .LBB61_191 +; CHECK-RV32-NEXT: .LBB61_712: # %cond.load713 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 180 +; CHECK-RV32-NEXT: li a4, 179 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_713 +; CHECK-RV32-NEXT: j .LBB61_192 +; CHECK-RV32-NEXT: .LBB61_713: # %cond.load717 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 181 +; CHECK-RV32-NEXT: li a4, 180 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_714 +; CHECK-RV32-NEXT: j .LBB61_193 +; CHECK-RV32-NEXT: .LBB61_714: # %cond.load721 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 182 +; CHECK-RV32-NEXT: li a4, 181 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_715 +; CHECK-RV32-NEXT: j .LBB61_194 +; CHECK-RV32-NEXT: .LBB61_715: # %cond.load725 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 183 +; CHECK-RV32-NEXT: li a4, 182 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_716 +; CHECK-RV32-NEXT: j .LBB61_195 +; CHECK-RV32-NEXT: .LBB61_716: # %cond.load729 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 184 +; CHECK-RV32-NEXT: li a4, 183 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_717 +; CHECK-RV32-NEXT: j .LBB61_196 +; CHECK-RV32-NEXT: .LBB61_717: # %cond.load733 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 185 +; CHECK-RV32-NEXT: li a4, 184 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_718 +; CHECK-RV32-NEXT: j .LBB61_197 +; CHECK-RV32-NEXT: .LBB61_718: # %cond.load737 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 186 +; CHECK-RV32-NEXT: li a4, 185 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_719 +; CHECK-RV32-NEXT: j .LBB61_198 +; CHECK-RV32-NEXT: .LBB61_719: # %cond.load741 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 187 +; CHECK-RV32-NEXT: li a4, 186 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_720 +; CHECK-RV32-NEXT: j .LBB61_199 +; CHECK-RV32-NEXT: .LBB61_720: # %cond.load745 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 188 +; CHECK-RV32-NEXT: li a4, 187 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_721 +; CHECK-RV32-NEXT: j .LBB61_200 +; CHECK-RV32-NEXT: .LBB61_721: # %cond.load749 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 189 +; CHECK-RV32-NEXT: li a4, 188 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1030 +; CHECK-RV32-NEXT: j .LBB61_201 +; CHECK-RV32-NEXT: .LBB61_1030: # %cond.load749 +; 
CHECK-RV32-NEXT: j .LBB61_202 +; CHECK-RV32-NEXT: .LBB61_722: # %cond.load761 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 192 +; CHECK-RV32-NEXT: li a4, 191 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_723 +; CHECK-RV32-NEXT: j .LBB61_206 +; CHECK-RV32-NEXT: .LBB61_723: # %cond.load765 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 193 +; CHECK-RV32-NEXT: li a4, 192 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_724 +; CHECK-RV32-NEXT: j .LBB61_207 +; CHECK-RV32-NEXT: .LBB61_724: # %cond.load769 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 194 +; CHECK-RV32-NEXT: li a4, 193 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_725 +; CHECK-RV32-NEXT: j .LBB61_208 +; CHECK-RV32-NEXT: .LBB61_725: # %cond.load773 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 195 +; CHECK-RV32-NEXT: li a4, 194 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_726 +; CHECK-RV32-NEXT: j .LBB61_209 +; CHECK-RV32-NEXT: .LBB61_726: # %cond.load777 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 196 +; CHECK-RV32-NEXT: li a4, 195 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_727 +; CHECK-RV32-NEXT: j .LBB61_210 +; CHECK-RV32-NEXT: .LBB61_727: # %cond.load781 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 197 +; CHECK-RV32-NEXT: li a4, 196 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_728 +; CHECK-RV32-NEXT: j .LBB61_211 +; CHECK-RV32-NEXT: .LBB61_728: # %cond.load785 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 198 +; CHECK-RV32-NEXT: li a4, 197 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_729 +; CHECK-RV32-NEXT: j .LBB61_212 +; CHECK-RV32-NEXT: .LBB61_729: # %cond.load789 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 199 +; CHECK-RV32-NEXT: li a4, 198 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_730 +; CHECK-RV32-NEXT: j .LBB61_213 +; CHECK-RV32-NEXT: .LBB61_730: # %cond.load793 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 200 +; CHECK-RV32-NEXT: li a4, 199 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_731 +; CHECK-RV32-NEXT: j .LBB61_214 +; CHECK-RV32-NEXT: .LBB61_731: # %cond.load797 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 201 +; CHECK-RV32-NEXT: li a4, 200 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_732 +; CHECK-RV32-NEXT: j .LBB61_215 +; CHECK-RV32-NEXT: .LBB61_732: # %cond.load801 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 202 +; CHECK-RV32-NEXT: li a4, 201 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_733 +; CHECK-RV32-NEXT: j .LBB61_216 +; CHECK-RV32-NEXT: .LBB61_733: # %cond.load805 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 203 +; CHECK-RV32-NEXT: li a4, 202 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_734 +; CHECK-RV32-NEXT: j .LBB61_217 +; CHECK-RV32-NEXT: .LBB61_734: # %cond.load809 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 204 +; CHECK-RV32-NEXT: li a4, 203 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_735 +; CHECK-RV32-NEXT: j .LBB61_218 +; CHECK-RV32-NEXT: .LBB61_735: # %cond.load813 +; 
CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 205 +; CHECK-RV32-NEXT: li a4, 204 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_736 +; CHECK-RV32-NEXT: j .LBB61_219 +; CHECK-RV32-NEXT: .LBB61_736: # %cond.load817 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 206 +; CHECK-RV32-NEXT: li a4, 205 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_737 +; CHECK-RV32-NEXT: j .LBB61_220 +; CHECK-RV32-NEXT: .LBB61_737: # %cond.load821 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 207 +; CHECK-RV32-NEXT: li a4, 206 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_738 +; CHECK-RV32-NEXT: j .LBB61_221 +; CHECK-RV32-NEXT: .LBB61_738: # %cond.load825 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 208 +; CHECK-RV32-NEXT: li a4, 207 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_739 +; CHECK-RV32-NEXT: j .LBB61_222 +; CHECK-RV32-NEXT: .LBB61_739: # %cond.load829 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 209 +; CHECK-RV32-NEXT: li a4, 208 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_740 +; CHECK-RV32-NEXT: j .LBB61_223 +; CHECK-RV32-NEXT: .LBB61_740: # %cond.load833 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 210 +; CHECK-RV32-NEXT: li a4, 209 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_741 +; CHECK-RV32-NEXT: j .LBB61_224 +; CHECK-RV32-NEXT: .LBB61_741: # %cond.load837 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 211 +; CHECK-RV32-NEXT: li a4, 210 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_742 +; CHECK-RV32-NEXT: j .LBB61_225 +; CHECK-RV32-NEXT: .LBB61_742: # %cond.load841 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 212 +; CHECK-RV32-NEXT: li a4, 211 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_743 +; CHECK-RV32-NEXT: j .LBB61_226 +; CHECK-RV32-NEXT: .LBB61_743: # %cond.load845 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 213 +; CHECK-RV32-NEXT: li a4, 212 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_744 +; CHECK-RV32-NEXT: j .LBB61_227 +; CHECK-RV32-NEXT: .LBB61_744: # %cond.load849 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 214 +; CHECK-RV32-NEXT: li a4, 213 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_745 +; CHECK-RV32-NEXT: j .LBB61_228 +; CHECK-RV32-NEXT: .LBB61_745: # %cond.load853 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 215 +; CHECK-RV32-NEXT: li a4, 214 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_746 +; CHECK-RV32-NEXT: j .LBB61_229 +; CHECK-RV32-NEXT: .LBB61_746: # %cond.load857 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 216 +; CHECK-RV32-NEXT: li a4, 215 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_747 +; CHECK-RV32-NEXT: j .LBB61_230 +; CHECK-RV32-NEXT: .LBB61_747: # %cond.load861 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 217 +; CHECK-RV32-NEXT: li a4, 216 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_748 +; CHECK-RV32-NEXT: j .LBB61_231 +; CHECK-RV32-NEXT: .LBB61_748: # %cond.load865 +; 
CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 218 +; CHECK-RV32-NEXT: li a4, 217 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_749 +; CHECK-RV32-NEXT: j .LBB61_232 +; CHECK-RV32-NEXT: .LBB61_749: # %cond.load869 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 219 +; CHECK-RV32-NEXT: li a4, 218 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_750 +; CHECK-RV32-NEXT: j .LBB61_233 +; CHECK-RV32-NEXT: .LBB61_750: # %cond.load873 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 220 +; CHECK-RV32-NEXT: li a4, 219 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_751 +; CHECK-RV32-NEXT: j .LBB61_234 +; CHECK-RV32-NEXT: .LBB61_751: # %cond.load877 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 221 +; CHECK-RV32-NEXT: li a4, 220 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1031 +; CHECK-RV32-NEXT: j .LBB61_235 +; CHECK-RV32-NEXT: .LBB61_1031: # %cond.load877 +; CHECK-RV32-NEXT: j .LBB61_236 +; CHECK-RV32-NEXT: .LBB61_752: # %cond.load889 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 224 +; CHECK-RV32-NEXT: li a4, 223 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_753 +; CHECK-RV32-NEXT: j .LBB61_240 +; CHECK-RV32-NEXT: .LBB61_753: # %cond.load893 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 225 +; CHECK-RV32-NEXT: li a4, 224 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_754 +; CHECK-RV32-NEXT: j .LBB61_241 +; CHECK-RV32-NEXT: .LBB61_754: # %cond.load897 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 226 +; CHECK-RV32-NEXT: li a4, 225 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_755 +; CHECK-RV32-NEXT: j .LBB61_242 +; CHECK-RV32-NEXT: .LBB61_755: # %cond.load901 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 227 +; CHECK-RV32-NEXT: li a4, 226 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_756 +; CHECK-RV32-NEXT: j .LBB61_243 +; CHECK-RV32-NEXT: .LBB61_756: # %cond.load905 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 228 +; CHECK-RV32-NEXT: li a4, 227 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_757 +; CHECK-RV32-NEXT: j .LBB61_244 +; CHECK-RV32-NEXT: .LBB61_757: # %cond.load909 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 229 +; CHECK-RV32-NEXT: li a4, 228 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_758 +; CHECK-RV32-NEXT: j .LBB61_245 +; CHECK-RV32-NEXT: .LBB61_758: # %cond.load913 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 230 +; CHECK-RV32-NEXT: li a4, 229 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_759 +; CHECK-RV32-NEXT: j .LBB61_246 +; CHECK-RV32-NEXT: .LBB61_759: # %cond.load917 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 231 +; CHECK-RV32-NEXT: li a4, 230 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_760 +; CHECK-RV32-NEXT: j .LBB61_247 +; CHECK-RV32-NEXT: .LBB61_760: # %cond.load921 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 232 +; CHECK-RV32-NEXT: li a4, 231 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_761 +; CHECK-RV32-NEXT: j .LBB61_248 +; CHECK-RV32-NEXT: .LBB61_761: # %cond.load925 +; 
CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 233 +; CHECK-RV32-NEXT: li a4, 232 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_762 +; CHECK-RV32-NEXT: j .LBB61_249 +; CHECK-RV32-NEXT: .LBB61_762: # %cond.load929 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 234 +; CHECK-RV32-NEXT: li a4, 233 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_763 +; CHECK-RV32-NEXT: j .LBB61_250 +; CHECK-RV32-NEXT: .LBB61_763: # %cond.load933 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 235 +; CHECK-RV32-NEXT: li a4, 234 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_764 +; CHECK-RV32-NEXT: j .LBB61_251 +; CHECK-RV32-NEXT: .LBB61_764: # %cond.load937 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 236 +; CHECK-RV32-NEXT: li a4, 235 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_765 +; CHECK-RV32-NEXT: j .LBB61_252 +; CHECK-RV32-NEXT: .LBB61_765: # %cond.load941 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 237 +; CHECK-RV32-NEXT: li a4, 236 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_766 +; CHECK-RV32-NEXT: j .LBB61_253 +; CHECK-RV32-NEXT: .LBB61_766: # %cond.load945 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 238 +; CHECK-RV32-NEXT: li a4, 237 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_767 +; CHECK-RV32-NEXT: j .LBB61_254 +; CHECK-RV32-NEXT: .LBB61_767: # %cond.load949 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 239 +; CHECK-RV32-NEXT: li a4, 238 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_768 +; CHECK-RV32-NEXT: j .LBB61_255 +; CHECK-RV32-NEXT: .LBB61_768: # %cond.load953 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 240 +; CHECK-RV32-NEXT: li a4, 239 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_769 +; CHECK-RV32-NEXT: j .LBB61_256 +; CHECK-RV32-NEXT: .LBB61_769: # %cond.load957 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 241 +; CHECK-RV32-NEXT: li a4, 240 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_770 +; CHECK-RV32-NEXT: j .LBB61_257 +; CHECK-RV32-NEXT: .LBB61_770: # %cond.load961 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 242 +; CHECK-RV32-NEXT: li a4, 241 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_771 +; CHECK-RV32-NEXT: j .LBB61_258 +; CHECK-RV32-NEXT: .LBB61_771: # %cond.load965 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 243 +; CHECK-RV32-NEXT: li a4, 242 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_772 +; CHECK-RV32-NEXT: j .LBB61_259 +; CHECK-RV32-NEXT: .LBB61_772: # %cond.load969 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 244 +; CHECK-RV32-NEXT: li a4, 243 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_773 +; CHECK-RV32-NEXT: j .LBB61_260 +; CHECK-RV32-NEXT: .LBB61_773: # %cond.load973 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 245 +; CHECK-RV32-NEXT: li a4, 244 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_774 +; CHECK-RV32-NEXT: j .LBB61_261 +; CHECK-RV32-NEXT: .LBB61_774: # %cond.load977 +; 
CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 246 +; CHECK-RV32-NEXT: li a4, 245 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_775 +; CHECK-RV32-NEXT: j .LBB61_262 +; CHECK-RV32-NEXT: .LBB61_775: # %cond.load981 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 247 +; CHECK-RV32-NEXT: li a4, 246 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_776 +; CHECK-RV32-NEXT: j .LBB61_263 +; CHECK-RV32-NEXT: .LBB61_776: # %cond.load985 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 248 +; CHECK-RV32-NEXT: li a4, 247 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_777 +; CHECK-RV32-NEXT: j .LBB61_264 +; CHECK-RV32-NEXT: .LBB61_777: # %cond.load989 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 249 +; CHECK-RV32-NEXT: li a4, 248 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_778 +; CHECK-RV32-NEXT: j .LBB61_265 +; CHECK-RV32-NEXT: .LBB61_778: # %cond.load993 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 250 +; CHECK-RV32-NEXT: li a4, 249 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_779 +; CHECK-RV32-NEXT: j .LBB61_266 +; CHECK-RV32-NEXT: .LBB61_779: # %cond.load997 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 251 +; CHECK-RV32-NEXT: li a4, 250 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_780 +; CHECK-RV32-NEXT: j .LBB61_267 +; CHECK-RV32-NEXT: .LBB61_780: # %cond.load1001 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 252 +; CHECK-RV32-NEXT: li a4, 251 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; 
CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_781 +; CHECK-RV32-NEXT: j .LBB61_268 +; CHECK-RV32-NEXT: .LBB61_781: # %cond.load1005 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a3, 253 +; CHECK-RV32-NEXT: li a4, 252 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1032 +; CHECK-RV32-NEXT: j .LBB61_269 +; CHECK-RV32-NEXT: .LBB61_1032: # %cond.load1005 +; CHECK-RV32-NEXT: j .LBB61_270 +; CHECK-RV32-NEXT: .LBB61_782: # %cond.load1017 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v20, a2 +; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: li a2, 256 +; CHECK-RV32-NEXT: li a4, 255 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v20, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: vmv4r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_783 +; CHECK-RV32-NEXT: j .LBB61_274 +; CHECK-RV32-NEXT: .LBB61_783: # %cond.load1021 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 257 +; CHECK-RV32-NEXT: li a4, 256 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_784 +; CHECK-RV32-NEXT: j .LBB61_275 +; 
CHECK-RV32-NEXT: .LBB61_784: # %cond.load1025 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 258 +; CHECK-RV32-NEXT: li a4, 257 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_785 +; CHECK-RV32-NEXT: j .LBB61_276 +; CHECK-RV32-NEXT: .LBB61_785: # %cond.load1029 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 259 +; CHECK-RV32-NEXT: li a4, 258 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_786 +; CHECK-RV32-NEXT: j .LBB61_277 +; CHECK-RV32-NEXT: .LBB61_786: # %cond.load1033 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 260 +; CHECK-RV32-NEXT: li a4, 259 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_787 +; CHECK-RV32-NEXT: j .LBB61_278 +; CHECK-RV32-NEXT: .LBB61_787: # %cond.load1037 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 261 +; CHECK-RV32-NEXT: li a4, 260 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_788 +; 
CHECK-RV32-NEXT: j .LBB61_279 +; CHECK-RV32-NEXT: .LBB61_788: # %cond.load1041 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 262 +; CHECK-RV32-NEXT: li a4, 261 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_789 +; CHECK-RV32-NEXT: j .LBB61_280 +; CHECK-RV32-NEXT: .LBB61_789: # %cond.load1045 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 263 +; CHECK-RV32-NEXT: li a4, 262 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_790 +; CHECK-RV32-NEXT: j .LBB61_281 +; CHECK-RV32-NEXT: .LBB61_790: # %cond.load1049 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 264 +; CHECK-RV32-NEXT: li a4, 263 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_791 +; CHECK-RV32-NEXT: j .LBB61_282 +; CHECK-RV32-NEXT: .LBB61_791: # %cond.load1053 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 265 +; CHECK-RV32-NEXT: li a4, 264 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; 
CHECK-RV32-NEXT: bnez a2, .LBB61_792 +; CHECK-RV32-NEXT: j .LBB61_283 +; CHECK-RV32-NEXT: .LBB61_792: # %cond.load1057 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 266 +; CHECK-RV32-NEXT: li a4, 265 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_793 +; CHECK-RV32-NEXT: j .LBB61_284 +; CHECK-RV32-NEXT: .LBB61_793: # %cond.load1061 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 267 +; CHECK-RV32-NEXT: li a4, 266 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_794 +; CHECK-RV32-NEXT: j .LBB61_285 +; CHECK-RV32-NEXT: .LBB61_794: # %cond.load1065 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 268 +; CHECK-RV32-NEXT: li a4, 267 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_795 +; CHECK-RV32-NEXT: j .LBB61_286 +; CHECK-RV32-NEXT: .LBB61_795: # %cond.load1069 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 269 +; CHECK-RV32-NEXT: li a4, 268 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_796 +; CHECK-RV32-NEXT: j .LBB61_287 +; CHECK-RV32-NEXT: .LBB61_796: # %cond.load1073 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 270 +; CHECK-RV32-NEXT: li a4, 269 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_797 +; CHECK-RV32-NEXT: j .LBB61_288 +; CHECK-RV32-NEXT: .LBB61_797: # %cond.load1077 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 271 +; CHECK-RV32-NEXT: li a4, 270 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_798 +; CHECK-RV32-NEXT: j .LBB61_289 +; CHECK-RV32-NEXT: .LBB61_798: # %cond.load1081 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 272 +; CHECK-RV32-NEXT: li a4, 271 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_799 +; CHECK-RV32-NEXT: j .LBB61_290 +; CHECK-RV32-NEXT: .LBB61_799: # %cond.load1085 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 273 +; CHECK-RV32-NEXT: li a4, 272 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; 
CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_800 +; CHECK-RV32-NEXT: j .LBB61_291 +; CHECK-RV32-NEXT: .LBB61_800: # %cond.load1089 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 274 +; CHECK-RV32-NEXT: li a4, 273 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_801 +; CHECK-RV32-NEXT: j .LBB61_292 +; CHECK-RV32-NEXT: .LBB61_801: # %cond.load1093 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 275 +; CHECK-RV32-NEXT: li a4, 274 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_802 +; CHECK-RV32-NEXT: j .LBB61_293 +; CHECK-RV32-NEXT: .LBB61_802: # %cond.load1097 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 276 +; CHECK-RV32-NEXT: li a4, 275 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_803 +; CHECK-RV32-NEXT: j .LBB61_294 +; CHECK-RV32-NEXT: .LBB61_803: # %cond.load1101 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 277 +; CHECK-RV32-NEXT: li a4, 276 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_804 +; CHECK-RV32-NEXT: j .LBB61_295 +; CHECK-RV32-NEXT: .LBB61_804: # %cond.load1105 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 278 +; CHECK-RV32-NEXT: li a4, 277 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_805 +; CHECK-RV32-NEXT: j .LBB61_296 +; CHECK-RV32-NEXT: .LBB61_805: # %cond.load1109 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 279 +; CHECK-RV32-NEXT: li a4, 278 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_806 +; CHECK-RV32-NEXT: j .LBB61_297 +; CHECK-RV32-NEXT: .LBB61_806: # %cond.load1113 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 280 +; CHECK-RV32-NEXT: li a4, 279 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_807 +; CHECK-RV32-NEXT: j .LBB61_298 +; CHECK-RV32-NEXT: .LBB61_807: # %cond.load1117 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 281 +; CHECK-RV32-NEXT: li a4, 280 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, 
ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_808 +; CHECK-RV32-NEXT: j .LBB61_299 +; CHECK-RV32-NEXT: .LBB61_808: # %cond.load1121 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 282 +; CHECK-RV32-NEXT: li a4, 281 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_809 +; CHECK-RV32-NEXT: j .LBB61_300 +; CHECK-RV32-NEXT: .LBB61_809: # %cond.load1125 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 283 +; CHECK-RV32-NEXT: li a4, 282 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_810 +; CHECK-RV32-NEXT: j .LBB61_301 +; CHECK-RV32-NEXT: .LBB61_810: # %cond.load1129 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 284 +; CHECK-RV32-NEXT: li a4, 283 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_811 +; CHECK-RV32-NEXT: j .LBB61_302 +; CHECK-RV32-NEXT: .LBB61_811: # %cond.load1133 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 285 +; CHECK-RV32-NEXT: li a4, 284 +; CHECK-RV32-NEXT: vsetvli 
zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1033 +; CHECK-RV32-NEXT: j .LBB61_303 +; CHECK-RV32-NEXT: .LBB61_1033: # %cond.load1133 +; CHECK-RV32-NEXT: j .LBB61_304 +; CHECK-RV32-NEXT: .LBB61_812: # %cond.load1145 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 288 +; CHECK-RV32-NEXT: li a4, 287 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_813 +; CHECK-RV32-NEXT: j .LBB61_308 +; CHECK-RV32-NEXT: .LBB61_813: # %cond.load1149 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 289 +; CHECK-RV32-NEXT: li a4, 288 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_814 +; CHECK-RV32-NEXT: j .LBB61_309 +; CHECK-RV32-NEXT: .LBB61_814: # %cond.load1153 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 290 +; CHECK-RV32-NEXT: li a4, 289 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_815 +; CHECK-RV32-NEXT: j .LBB61_310 +; CHECK-RV32-NEXT: .LBB61_815: # %cond.load1157 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 291 +; CHECK-RV32-NEXT: li a4, 290 +; 
CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_816 +; CHECK-RV32-NEXT: j .LBB61_311 +; CHECK-RV32-NEXT: .LBB61_816: # %cond.load1161 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 292 +; CHECK-RV32-NEXT: li a4, 291 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_817 +; CHECK-RV32-NEXT: j .LBB61_312 +; CHECK-RV32-NEXT: .LBB61_817: # %cond.load1165 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 293 +; CHECK-RV32-NEXT: li a4, 292 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_818 +; CHECK-RV32-NEXT: j .LBB61_313 +; CHECK-RV32-NEXT: .LBB61_818: # %cond.load1169 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 294 +; CHECK-RV32-NEXT: li a4, 293 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_819 +; CHECK-RV32-NEXT: j .LBB61_314 +; CHECK-RV32-NEXT: .LBB61_819: # %cond.load1173 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 295 +; 
CHECK-RV32-NEXT: li a4, 294 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_820 +; CHECK-RV32-NEXT: j .LBB61_315 +; CHECK-RV32-NEXT: .LBB61_820: # %cond.load1177 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 296 +; CHECK-RV32-NEXT: li a4, 295 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_821 +; CHECK-RV32-NEXT: j .LBB61_316 +; CHECK-RV32-NEXT: .LBB61_821: # %cond.load1181 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 297 +; CHECK-RV32-NEXT: li a4, 296 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_822 +; CHECK-RV32-NEXT: j .LBB61_317 +; CHECK-RV32-NEXT: .LBB61_822: # %cond.load1185 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 298 +; CHECK-RV32-NEXT: li a4, 297 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_823 +; CHECK-RV32-NEXT: j .LBB61_318 +; CHECK-RV32-NEXT: .LBB61_823: # %cond.load1189 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: li a3, 299 +; CHECK-RV32-NEXT: li a4, 298 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_824 +; CHECK-RV32-NEXT: j .LBB61_319 +; CHECK-RV32-NEXT: .LBB61_824: # %cond.load1193 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 300 +; CHECK-RV32-NEXT: li a4, 299 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_825 +; CHECK-RV32-NEXT: j .LBB61_320 +; CHECK-RV32-NEXT: .LBB61_825: # %cond.load1197 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 301 +; CHECK-RV32-NEXT: li a4, 300 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_826 +; CHECK-RV32-NEXT: j .LBB61_321 +; CHECK-RV32-NEXT: .LBB61_826: # %cond.load1201 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 302 +; CHECK-RV32-NEXT: li a4, 301 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_827 +; CHECK-RV32-NEXT: j .LBB61_322 +; CHECK-RV32-NEXT: .LBB61_827: # %cond.load1205 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 303 +; CHECK-RV32-NEXT: li a4, 302 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_828 +; CHECK-RV32-NEXT: j .LBB61_323 +; CHECK-RV32-NEXT: .LBB61_828: # %cond.load1209 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 304 +; CHECK-RV32-NEXT: li a4, 303 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_829 +; CHECK-RV32-NEXT: j .LBB61_324 +; CHECK-RV32-NEXT: .LBB61_829: # %cond.load1213 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 305 +; CHECK-RV32-NEXT: li a4, 304 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_830 +; CHECK-RV32-NEXT: j .LBB61_325 +; CHECK-RV32-NEXT: .LBB61_830: # %cond.load1217 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 306 +; CHECK-RV32-NEXT: li a4, 305 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_831 +; CHECK-RV32-NEXT: j .LBB61_326 +; CHECK-RV32-NEXT: .LBB61_831: # %cond.load1221 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli 
zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 307 +; CHECK-RV32-NEXT: li a4, 306 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_832 +; CHECK-RV32-NEXT: j .LBB61_327 +; CHECK-RV32-NEXT: .LBB61_832: # %cond.load1225 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 308 +; CHECK-RV32-NEXT: li a4, 307 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_833 +; CHECK-RV32-NEXT: j .LBB61_328 +; CHECK-RV32-NEXT: .LBB61_833: # %cond.load1229 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 309 +; CHECK-RV32-NEXT: li a4, 308 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_834 +; CHECK-RV32-NEXT: j .LBB61_329 +; CHECK-RV32-NEXT: .LBB61_834: # %cond.load1233 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 310 +; CHECK-RV32-NEXT: li a4, 309 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_835 +; CHECK-RV32-NEXT: j .LBB61_330 +; CHECK-RV32-NEXT: .LBB61_835: # %cond.load1237 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; 
CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 311 +; CHECK-RV32-NEXT: li a4, 310 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_836 +; CHECK-RV32-NEXT: j .LBB61_331 +; CHECK-RV32-NEXT: .LBB61_836: # %cond.load1241 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 312 +; CHECK-RV32-NEXT: li a4, 311 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_837 +; CHECK-RV32-NEXT: j .LBB61_332 +; CHECK-RV32-NEXT: .LBB61_837: # %cond.load1245 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 313 +; CHECK-RV32-NEXT: li a4, 312 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_838 +; CHECK-RV32-NEXT: j .LBB61_333 +; CHECK-RV32-NEXT: .LBB61_838: # %cond.load1249 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 314 +; CHECK-RV32-NEXT: li a4, 313 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_839 +; CHECK-RV32-NEXT: j .LBB61_334 +; CHECK-RV32-NEXT: .LBB61_839: # %cond.load1253 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; 
CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 315 +; CHECK-RV32-NEXT: li a4, 314 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_840 +; CHECK-RV32-NEXT: j .LBB61_335 +; CHECK-RV32-NEXT: .LBB61_840: # %cond.load1257 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 316 +; CHECK-RV32-NEXT: li a4, 315 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_841 +; CHECK-RV32-NEXT: j .LBB61_336 +; CHECK-RV32-NEXT: .LBB61_841: # %cond.load1261 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 317 +; CHECK-RV32-NEXT: li a4, 316 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1034 +; CHECK-RV32-NEXT: j .LBB61_337 +; CHECK-RV32-NEXT: .LBB61_1034: # %cond.load1261 +; CHECK-RV32-NEXT: j .LBB61_338 +; CHECK-RV32-NEXT: .LBB61_842: # %cond.load1273 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 320 +; CHECK-RV32-NEXT: li a4, 319 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_843 +; CHECK-RV32-NEXT: j .LBB61_342 +; CHECK-RV32-NEXT: .LBB61_843: # %cond.load1277 +; 
CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 321 +; CHECK-RV32-NEXT: li a4, 320 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_844 +; CHECK-RV32-NEXT: j .LBB61_343 +; CHECK-RV32-NEXT: .LBB61_844: # %cond.load1281 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 322 +; CHECK-RV32-NEXT: li a4, 321 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_845 +; CHECK-RV32-NEXT: j .LBB61_344 +; CHECK-RV32-NEXT: .LBB61_845: # %cond.load1285 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 323 +; CHECK-RV32-NEXT: li a4, 322 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_846 +; CHECK-RV32-NEXT: j .LBB61_345 +; CHECK-RV32-NEXT: .LBB61_846: # %cond.load1289 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 324 +; CHECK-RV32-NEXT: li a4, 323 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_847 +; CHECK-RV32-NEXT: j .LBB61_346 +; CHECK-RV32-NEXT: 
.LBB61_847: # %cond.load1293 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 325 +; CHECK-RV32-NEXT: li a4, 324 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_848 +; CHECK-RV32-NEXT: j .LBB61_347 +; CHECK-RV32-NEXT: .LBB61_848: # %cond.load1297 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 326 +; CHECK-RV32-NEXT: li a4, 325 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_849 +; CHECK-RV32-NEXT: j .LBB61_348 +; CHECK-RV32-NEXT: .LBB61_849: # %cond.load1301 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 327 +; CHECK-RV32-NEXT: li a4, 326 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_850 +; CHECK-RV32-NEXT: j .LBB61_349 +; CHECK-RV32-NEXT: .LBB61_850: # %cond.load1305 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 328 +; CHECK-RV32-NEXT: li a4, 327 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_851 +; CHECK-RV32-NEXT: j 
.LBB61_350 +; CHECK-RV32-NEXT: .LBB61_851: # %cond.load1309 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 329 +; CHECK-RV32-NEXT: li a4, 328 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_852 +; CHECK-RV32-NEXT: j .LBB61_351 +; CHECK-RV32-NEXT: .LBB61_852: # %cond.load1313 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 330 +; CHECK-RV32-NEXT: li a4, 329 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_853 +; CHECK-RV32-NEXT: j .LBB61_352 +; CHECK-RV32-NEXT: .LBB61_853: # %cond.load1317 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 331 +; CHECK-RV32-NEXT: li a4, 330 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_854 +; CHECK-RV32-NEXT: j .LBB61_353 +; CHECK-RV32-NEXT: .LBB61_854: # %cond.load1321 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 332 +; CHECK-RV32-NEXT: li a4, 331 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, 
.LBB61_855 +; CHECK-RV32-NEXT: j .LBB61_354 +; CHECK-RV32-NEXT: .LBB61_855: # %cond.load1325 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 333 +; CHECK-RV32-NEXT: li a4, 332 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_856 +; CHECK-RV32-NEXT: j .LBB61_355 +; CHECK-RV32-NEXT: .LBB61_856: # %cond.load1329 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 334 +; CHECK-RV32-NEXT: li a4, 333 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_857 +; CHECK-RV32-NEXT: j .LBB61_356 +; CHECK-RV32-NEXT: .LBB61_857: # %cond.load1333 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 335 +; CHECK-RV32-NEXT: li a4, 334 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_858 +; CHECK-RV32-NEXT: j .LBB61_357 +; CHECK-RV32-NEXT: .LBB61_858: # %cond.load1337 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 336 +; CHECK-RV32-NEXT: li a4, 335 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; 
CHECK-RV32-NEXT: bltz a2, .LBB61_859 +; CHECK-RV32-NEXT: j .LBB61_358 +; CHECK-RV32-NEXT: .LBB61_859: # %cond.load1341 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 337 +; CHECK-RV32-NEXT: li a4, 336 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_860 +; CHECK-RV32-NEXT: j .LBB61_359 +; CHECK-RV32-NEXT: .LBB61_860: # %cond.load1345 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 338 +; CHECK-RV32-NEXT: li a4, 337 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_861 +; CHECK-RV32-NEXT: j .LBB61_360 +; CHECK-RV32-NEXT: .LBB61_861: # %cond.load1349 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 339 +; CHECK-RV32-NEXT: li a4, 338 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_862 +; CHECK-RV32-NEXT: j .LBB61_361 +; CHECK-RV32-NEXT: .LBB61_862: # %cond.load1353 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 340 +; CHECK-RV32-NEXT: li a4, 339 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_863 +; CHECK-RV32-NEXT: j .LBB61_362 +; CHECK-RV32-NEXT: .LBB61_863: # %cond.load1357 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 341 +; CHECK-RV32-NEXT: li a4, 340 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_864 +; CHECK-RV32-NEXT: j .LBB61_363 +; CHECK-RV32-NEXT: .LBB61_864: # %cond.load1361 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 342 +; CHECK-RV32-NEXT: li a4, 341 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_865 +; CHECK-RV32-NEXT: j .LBB61_364 +; CHECK-RV32-NEXT: .LBB61_865: # %cond.load1365 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 343 +; CHECK-RV32-NEXT: li a4, 342 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_866 +; CHECK-RV32-NEXT: j .LBB61_365 +; CHECK-RV32-NEXT: .LBB61_866: # %cond.load1369 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 344 +; CHECK-RV32-NEXT: li a4, 343 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; 
CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_867 +; CHECK-RV32-NEXT: j .LBB61_366 +; CHECK-RV32-NEXT: .LBB61_867: # %cond.load1373 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 345 +; CHECK-RV32-NEXT: li a4, 344 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_868 +; CHECK-RV32-NEXT: j .LBB61_367 +; CHECK-RV32-NEXT: .LBB61_868: # %cond.load1377 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 346 +; CHECK-RV32-NEXT: li a4, 345 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_869 +; CHECK-RV32-NEXT: j .LBB61_368 +; CHECK-RV32-NEXT: .LBB61_869: # %cond.load1381 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 347 +; CHECK-RV32-NEXT: li a4, 346 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_870 +; CHECK-RV32-NEXT: j .LBB61_369 +; CHECK-RV32-NEXT: .LBB61_870: # %cond.load1385 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 348 +; CHECK-RV32-NEXT: li a4, 347 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_871 +; CHECK-RV32-NEXT: j .LBB61_370 +; CHECK-RV32-NEXT: .LBB61_871: # %cond.load1389 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 349 +; CHECK-RV32-NEXT: li a4, 348 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1035 +; CHECK-RV32-NEXT: j .LBB61_371 +; CHECK-RV32-NEXT: .LBB61_1035: # %cond.load1389 +; CHECK-RV32-NEXT: j .LBB61_372 +; CHECK-RV32-NEXT: .LBB61_872: # %cond.load1401 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 352 +; CHECK-RV32-NEXT: li a4, 351 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_873 +; CHECK-RV32-NEXT: j .LBB61_376 +; CHECK-RV32-NEXT: .LBB61_873: # %cond.load1405 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 353 +; CHECK-RV32-NEXT: li a4, 352 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_874 +; CHECK-RV32-NEXT: j .LBB61_377 +; CHECK-RV32-NEXT: .LBB61_874: # %cond.load1409 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 354 +; CHECK-RV32-NEXT: li a4, 353 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma 
+; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_875 +; CHECK-RV32-NEXT: j .LBB61_378 +; CHECK-RV32-NEXT: .LBB61_875: # %cond.load1413 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 355 +; CHECK-RV32-NEXT: li a4, 354 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_876 +; CHECK-RV32-NEXT: j .LBB61_379 +; CHECK-RV32-NEXT: .LBB61_876: # %cond.load1417 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 356 +; CHECK-RV32-NEXT: li a4, 355 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_877 +; CHECK-RV32-NEXT: j .LBB61_380 +; CHECK-RV32-NEXT: .LBB61_877: # %cond.load1421 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 357 +; CHECK-RV32-NEXT: li a4, 356 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_878 +; CHECK-RV32-NEXT: j .LBB61_381 +; CHECK-RV32-NEXT: .LBB61_878: # %cond.load1425 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 358 +; CHECK-RV32-NEXT: li a4, 357 +; CHECK-RV32-NEXT: vsetvli 
zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_879 +; CHECK-RV32-NEXT: j .LBB61_382 +; CHECK-RV32-NEXT: .LBB61_879: # %cond.load1429 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 359 +; CHECK-RV32-NEXT: li a4, 358 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_880 +; CHECK-RV32-NEXT: j .LBB61_383 +; CHECK-RV32-NEXT: .LBB61_880: # %cond.load1433 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 360 +; CHECK-RV32-NEXT: li a4, 359 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_881 +; CHECK-RV32-NEXT: j .LBB61_384 +; CHECK-RV32-NEXT: .LBB61_881: # %cond.load1437 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 361 +; CHECK-RV32-NEXT: li a4, 360 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_882 +; CHECK-RV32-NEXT: j .LBB61_385 +; CHECK-RV32-NEXT: .LBB61_882: # %cond.load1441 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 362 +; CHECK-RV32-NEXT: li a4, 361 
+; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_883 +; CHECK-RV32-NEXT: j .LBB61_386 +; CHECK-RV32-NEXT: .LBB61_883: # %cond.load1445 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 363 +; CHECK-RV32-NEXT: li a4, 362 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_884 +; CHECK-RV32-NEXT: j .LBB61_387 +; CHECK-RV32-NEXT: .LBB61_884: # %cond.load1449 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 364 +; CHECK-RV32-NEXT: li a4, 363 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_885 +; CHECK-RV32-NEXT: j .LBB61_388 +; CHECK-RV32-NEXT: .LBB61_885: # %cond.load1453 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 365 +; CHECK-RV32-NEXT: li a4, 364 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_886 +; CHECK-RV32-NEXT: j .LBB61_389 +; CHECK-RV32-NEXT: .LBB61_886: # %cond.load1457 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 366 +; 
CHECK-RV32-NEXT: li a4, 365 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_887 +; CHECK-RV32-NEXT: j .LBB61_390 +; CHECK-RV32-NEXT: .LBB61_887: # %cond.load1461 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 367 +; CHECK-RV32-NEXT: li a4, 366 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_888 +; CHECK-RV32-NEXT: j .LBB61_391 +; CHECK-RV32-NEXT: .LBB61_888: # %cond.load1465 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 368 +; CHECK-RV32-NEXT: li a4, 367 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_889 +; CHECK-RV32-NEXT: j .LBB61_392 +; CHECK-RV32-NEXT: .LBB61_889: # %cond.load1469 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 369 +; CHECK-RV32-NEXT: li a4, 368 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_890 +; CHECK-RV32-NEXT: j .LBB61_393 +; CHECK-RV32-NEXT: .LBB61_890: # %cond.load1473 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: li a3, 370 +; CHECK-RV32-NEXT: li a4, 369 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_891 +; CHECK-RV32-NEXT: j .LBB61_394 +; CHECK-RV32-NEXT: .LBB61_891: # %cond.load1477 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 371 +; CHECK-RV32-NEXT: li a4, 370 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_892 +; CHECK-RV32-NEXT: j .LBB61_395 +; CHECK-RV32-NEXT: .LBB61_892: # %cond.load1481 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 372 +; CHECK-RV32-NEXT: li a4, 371 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_893 +; CHECK-RV32-NEXT: j .LBB61_396 +; CHECK-RV32-NEXT: .LBB61_893: # %cond.load1485 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 373 +; CHECK-RV32-NEXT: li a4, 372 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_894 +; CHECK-RV32-NEXT: j .LBB61_397 +; CHECK-RV32-NEXT: .LBB61_894: # %cond.load1489 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 374 +; CHECK-RV32-NEXT: li a4, 373 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_895 +; CHECK-RV32-NEXT: j .LBB61_398 +; CHECK-RV32-NEXT: .LBB61_895: # %cond.load1493 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 375 +; CHECK-RV32-NEXT: li a4, 374 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_896 +; CHECK-RV32-NEXT: j .LBB61_399 +; CHECK-RV32-NEXT: .LBB61_896: # %cond.load1497 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 376 +; CHECK-RV32-NEXT: li a4, 375 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_897 +; CHECK-RV32-NEXT: j .LBB61_400 +; CHECK-RV32-NEXT: .LBB61_897: # %cond.load1501 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 377 +; CHECK-RV32-NEXT: li a4, 376 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_898 +; CHECK-RV32-NEXT: j .LBB61_401 +; CHECK-RV32-NEXT: .LBB61_898: # %cond.load1505 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, 
a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 378 +; CHECK-RV32-NEXT: li a4, 377 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_899 +; CHECK-RV32-NEXT: j .LBB61_402 +; CHECK-RV32-NEXT: .LBB61_899: # %cond.load1509 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 379 +; CHECK-RV32-NEXT: li a4, 378 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_900 +; CHECK-RV32-NEXT: j .LBB61_403 +; CHECK-RV32-NEXT: .LBB61_900: # %cond.load1513 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 380 +; CHECK-RV32-NEXT: li a4, 379 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_901 +; CHECK-RV32-NEXT: j .LBB61_404 +; CHECK-RV32-NEXT: .LBB61_901: # %cond.load1517 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 381 +; CHECK-RV32-NEXT: li a4, 380 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1036 +; CHECK-RV32-NEXT: j .LBB61_405 +; CHECK-RV32-NEXT: .LBB61_1036: # %cond.load1517 +; CHECK-RV32-NEXT: j .LBB61_406 +; CHECK-RV32-NEXT: .LBB61_902: # 
%cond.load1529 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 384 +; CHECK-RV32-NEXT: li a4, 383 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_903 +; CHECK-RV32-NEXT: j .LBB61_410 +; CHECK-RV32-NEXT: .LBB61_903: # %cond.load1533 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 385 +; CHECK-RV32-NEXT: li a4, 384 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_904 +; CHECK-RV32-NEXT: j .LBB61_411 +; CHECK-RV32-NEXT: .LBB61_904: # %cond.load1537 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 386 +; CHECK-RV32-NEXT: li a4, 385 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_905 +; CHECK-RV32-NEXT: j .LBB61_412 +; CHECK-RV32-NEXT: .LBB61_905: # %cond.load1541 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 387 +; CHECK-RV32-NEXT: li a4, 386 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_906 +; CHECK-RV32-NEXT: j .LBB61_413 +; CHECK-RV32-NEXT: .LBB61_906: # %cond.load1545 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; 
CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 388 +; CHECK-RV32-NEXT: li a4, 387 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_907 +; CHECK-RV32-NEXT: j .LBB61_414 +; CHECK-RV32-NEXT: .LBB61_907: # %cond.load1549 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 389 +; CHECK-RV32-NEXT: li a4, 388 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_908 +; CHECK-RV32-NEXT: j .LBB61_415 +; CHECK-RV32-NEXT: .LBB61_908: # %cond.load1553 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 390 +; CHECK-RV32-NEXT: li a4, 389 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_909 +; CHECK-RV32-NEXT: j .LBB61_416 +; CHECK-RV32-NEXT: .LBB61_909: # %cond.load1557 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 391 +; CHECK-RV32-NEXT: li a4, 390 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_910 +; CHECK-RV32-NEXT: j .LBB61_417 +; CHECK-RV32-NEXT: .LBB61_910: # %cond.load1561 +; 
CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 392 +; CHECK-RV32-NEXT: li a4, 391 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_911 +; CHECK-RV32-NEXT: j .LBB61_418 +; CHECK-RV32-NEXT: .LBB61_911: # %cond.load1565 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 393 +; CHECK-RV32-NEXT: li a4, 392 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_912 +; CHECK-RV32-NEXT: j .LBB61_419 +; CHECK-RV32-NEXT: .LBB61_912: # %cond.load1569 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 394 +; CHECK-RV32-NEXT: li a4, 393 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_913 +; CHECK-RV32-NEXT: j .LBB61_420 +; CHECK-RV32-NEXT: .LBB61_913: # %cond.load1573 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 395 +; CHECK-RV32-NEXT: li a4, 394 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_914 +; CHECK-RV32-NEXT: j .LBB61_421 +; CHECK-RV32-NEXT: 
.LBB61_914: # %cond.load1577 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 396 +; CHECK-RV32-NEXT: li a4, 395 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_915 +; CHECK-RV32-NEXT: j .LBB61_422 +; CHECK-RV32-NEXT: .LBB61_915: # %cond.load1581 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 397 +; CHECK-RV32-NEXT: li a4, 396 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_916 +; CHECK-RV32-NEXT: j .LBB61_423 +; CHECK-RV32-NEXT: .LBB61_916: # %cond.load1585 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 398 +; CHECK-RV32-NEXT: li a4, 397 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_917 +; CHECK-RV32-NEXT: j .LBB61_424 +; CHECK-RV32-NEXT: .LBB61_917: # %cond.load1589 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 399 +; CHECK-RV32-NEXT: li a4, 398 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_918 +; CHECK-RV32-NEXT: j 
.LBB61_425 +; CHECK-RV32-NEXT: .LBB61_918: # %cond.load1593 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 400 +; CHECK-RV32-NEXT: li a4, 399 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_919 +; CHECK-RV32-NEXT: j .LBB61_426 +; CHECK-RV32-NEXT: .LBB61_919: # %cond.load1597 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 401 +; CHECK-RV32-NEXT: li a4, 400 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_920 +; CHECK-RV32-NEXT: j .LBB61_427 +; CHECK-RV32-NEXT: .LBB61_920: # %cond.load1601 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 402 +; CHECK-RV32-NEXT: li a4, 401 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_921 +; CHECK-RV32-NEXT: j .LBB61_428 +; CHECK-RV32-NEXT: .LBB61_921: # %cond.load1605 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 403 +; CHECK-RV32-NEXT: li a4, 402 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, 
.LBB61_922 +; CHECK-RV32-NEXT: j .LBB61_429 +; CHECK-RV32-NEXT: .LBB61_922: # %cond.load1609 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 404 +; CHECK-RV32-NEXT: li a4, 403 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_923 +; CHECK-RV32-NEXT: j .LBB61_430 +; CHECK-RV32-NEXT: .LBB61_923: # %cond.load1613 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 405 +; CHECK-RV32-NEXT: li a4, 404 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_924 +; CHECK-RV32-NEXT: j .LBB61_431 +; CHECK-RV32-NEXT: .LBB61_924: # %cond.load1617 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 406 +; CHECK-RV32-NEXT: li a4, 405 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_925 +; CHECK-RV32-NEXT: j .LBB61_432 +; CHECK-RV32-NEXT: .LBB61_925: # %cond.load1621 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 407 +; CHECK-RV32-NEXT: li a4, 406 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; 
CHECK-RV32-NEXT: bltz a2, .LBB61_926 +; CHECK-RV32-NEXT: j .LBB61_433 +; CHECK-RV32-NEXT: .LBB61_926: # %cond.load1625 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 408 +; CHECK-RV32-NEXT: li a4, 407 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_927 +; CHECK-RV32-NEXT: j .LBB61_434 +; CHECK-RV32-NEXT: .LBB61_927: # %cond.load1629 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 409 +; CHECK-RV32-NEXT: li a4, 408 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_928 +; CHECK-RV32-NEXT: j .LBB61_435 +; CHECK-RV32-NEXT: .LBB61_928: # %cond.load1633 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 410 +; CHECK-RV32-NEXT: li a4, 409 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_929 +; CHECK-RV32-NEXT: j .LBB61_436 +; CHECK-RV32-NEXT: .LBB61_929: # %cond.load1637 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 411 +; CHECK-RV32-NEXT: li a4, 410 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_930 +; CHECK-RV32-NEXT: j .LBB61_437 +; CHECK-RV32-NEXT: .LBB61_930: # %cond.load1641 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 412 +; CHECK-RV32-NEXT: li a4, 411 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_931 +; CHECK-RV32-NEXT: j .LBB61_438 +; CHECK-RV32-NEXT: .LBB61_931: # %cond.load1645 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 413 +; CHECK-RV32-NEXT: li a4, 412 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1037 +; CHECK-RV32-NEXT: j .LBB61_439 +; CHECK-RV32-NEXT: .LBB61_1037: # %cond.load1645 +; CHECK-RV32-NEXT: j .LBB61_440 +; CHECK-RV32-NEXT: .LBB61_932: # %cond.load1657 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 416 +; CHECK-RV32-NEXT: li a4, 415 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1 +; CHECK-RV32-NEXT: bnez a3, .LBB61_933 +; CHECK-RV32-NEXT: j .LBB61_444 +; CHECK-RV32-NEXT: .LBB61_933: # %cond.load1661 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 417 +; CHECK-RV32-NEXT: li a4, 416 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; 
CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 2 +; CHECK-RV32-NEXT: bnez a3, .LBB61_934 +; CHECK-RV32-NEXT: j .LBB61_445 +; CHECK-RV32-NEXT: .LBB61_934: # %cond.load1665 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 418 +; CHECK-RV32-NEXT: li a4, 417 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 4 +; CHECK-RV32-NEXT: bnez a3, .LBB61_935 +; CHECK-RV32-NEXT: j .LBB61_446 +; CHECK-RV32-NEXT: .LBB61_935: # %cond.load1669 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 419 +; CHECK-RV32-NEXT: li a4, 418 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 8 +; CHECK-RV32-NEXT: bnez a3, .LBB61_936 +; CHECK-RV32-NEXT: j .LBB61_447 +; CHECK-RV32-NEXT: .LBB61_936: # %cond.load1673 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 420 +; CHECK-RV32-NEXT: li a4, 419 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 16 +; CHECK-RV32-NEXT: bnez a3, .LBB61_937 +; CHECK-RV32-NEXT: j .LBB61_448 +; CHECK-RV32-NEXT: .LBB61_937: # %cond.load1677 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 421 +; CHECK-RV32-NEXT: li a4, 420 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 32 +; CHECK-RV32-NEXT: bnez a3, .LBB61_938 +; CHECK-RV32-NEXT: j .LBB61_449 +; CHECK-RV32-NEXT: .LBB61_938: # %cond.load1681 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 422 +; CHECK-RV32-NEXT: li a4, 421 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 64 +; CHECK-RV32-NEXT: bnez a3, .LBB61_939 +; CHECK-RV32-NEXT: j .LBB61_450 +; CHECK-RV32-NEXT: .LBB61_939: # %cond.load1685 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 423 +; CHECK-RV32-NEXT: li a4, 422 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 128 +; CHECK-RV32-NEXT: bnez a3, .LBB61_940 +; CHECK-RV32-NEXT: j .LBB61_451 +; CHECK-RV32-NEXT: .LBB61_940: # %cond.load1689 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 424 +; CHECK-RV32-NEXT: li a4, 423 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 256 +; CHECK-RV32-NEXT: bnez a3, .LBB61_941 +; CHECK-RV32-NEXT: j .LBB61_452 +; CHECK-RV32-NEXT: .LBB61_941: # %cond.load1693 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 425 +; CHECK-RV32-NEXT: li a4, 424 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, 
tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 512 +; CHECK-RV32-NEXT: bnez a3, .LBB61_942 +; CHECK-RV32-NEXT: j .LBB61_453 +; CHECK-RV32-NEXT: .LBB61_942: # %cond.load1697 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 426 +; CHECK-RV32-NEXT: li a4, 425 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a3, a2, 1024 +; CHECK-RV32-NEXT: bnez a3, .LBB61_943 +; CHECK-RV32-NEXT: j .LBB61_454 +; CHECK-RV32-NEXT: .LBB61_943: # %cond.load1701 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 427 +; CHECK-RV32-NEXT: li a4, 426 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 20 +; CHECK-RV32-NEXT: bltz a3, .LBB61_944 +; CHECK-RV32-NEXT: j .LBB61_455 +; CHECK-RV32-NEXT: .LBB61_944: # %cond.load1705 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 428 +; CHECK-RV32-NEXT: li a4, 427 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 19 +; CHECK-RV32-NEXT: bltz a3, .LBB61_945 +; CHECK-RV32-NEXT: j .LBB61_456 +; CHECK-RV32-NEXT: .LBB61_945: # %cond.load1709 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 429 +; CHECK-RV32-NEXT: li a4, 428 +; 
CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 18 +; CHECK-RV32-NEXT: bltz a3, .LBB61_946 +; CHECK-RV32-NEXT: j .LBB61_457 +; CHECK-RV32-NEXT: .LBB61_946: # %cond.load1713 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 430 +; CHECK-RV32-NEXT: li a4, 429 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 17 +; CHECK-RV32-NEXT: bltz a3, .LBB61_947 +; CHECK-RV32-NEXT: j .LBB61_458 +; CHECK-RV32-NEXT: .LBB61_947: # %cond.load1717 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 431 +; CHECK-RV32-NEXT: li a4, 430 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 16 +; CHECK-RV32-NEXT: bltz a3, .LBB61_948 +; CHECK-RV32-NEXT: j .LBB61_459 +; CHECK-RV32-NEXT: .LBB61_948: # %cond.load1721 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 432 +; CHECK-RV32-NEXT: li a4, 431 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 15 +; CHECK-RV32-NEXT: bltz a3, .LBB61_949 +; CHECK-RV32-NEXT: j .LBB61_460 +; CHECK-RV32-NEXT: .LBB61_949: # %cond.load1725 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 433 +; 
CHECK-RV32-NEXT: li a4, 432 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 14 +; CHECK-RV32-NEXT: bltz a3, .LBB61_950 +; CHECK-RV32-NEXT: j .LBB61_461 +; CHECK-RV32-NEXT: .LBB61_950: # %cond.load1729 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 434 +; CHECK-RV32-NEXT: li a4, 433 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 13 +; CHECK-RV32-NEXT: bltz a3, .LBB61_951 +; CHECK-RV32-NEXT: j .LBB61_462 +; CHECK-RV32-NEXT: .LBB61_951: # %cond.load1733 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 435 +; CHECK-RV32-NEXT: li a4, 434 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 12 +; CHECK-RV32-NEXT: bltz a3, .LBB61_952 +; CHECK-RV32-NEXT: j .LBB61_463 +; CHECK-RV32-NEXT: .LBB61_952: # %cond.load1737 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 436 +; CHECK-RV32-NEXT: li a4, 435 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 11 +; CHECK-RV32-NEXT: bltz a3, .LBB61_953 +; CHECK-RV32-NEXT: j .LBB61_464 +; CHECK-RV32-NEXT: .LBB61_953: # %cond.load1741 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; 
CHECK-RV32-NEXT: li a3, 437 +; CHECK-RV32-NEXT: li a4, 436 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 10 +; CHECK-RV32-NEXT: bltz a3, .LBB61_954 +; CHECK-RV32-NEXT: j .LBB61_465 +; CHECK-RV32-NEXT: .LBB61_954: # %cond.load1745 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 438 +; CHECK-RV32-NEXT: li a4, 437 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 9 +; CHECK-RV32-NEXT: bltz a3, .LBB61_955 +; CHECK-RV32-NEXT: j .LBB61_466 +; CHECK-RV32-NEXT: .LBB61_955: # %cond.load1749 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 439 +; CHECK-RV32-NEXT: li a4, 438 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 8 +; CHECK-RV32-NEXT: bltz a3, .LBB61_956 +; CHECK-RV32-NEXT: j .LBB61_467 +; CHECK-RV32-NEXT: .LBB61_956: # %cond.load1753 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 440 +; CHECK-RV32-NEXT: li a4, 439 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 7 +; CHECK-RV32-NEXT: bltz a3, .LBB61_957 +; CHECK-RV32-NEXT: j .LBB61_468 +; CHECK-RV32-NEXT: .LBB61_957: # %cond.load1757 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 441 +; CHECK-RV32-NEXT: li a4, 440 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 6 +; CHECK-RV32-NEXT: bltz a3, .LBB61_958 +; CHECK-RV32-NEXT: j .LBB61_469 +; CHECK-RV32-NEXT: .LBB61_958: # %cond.load1761 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 442 +; CHECK-RV32-NEXT: li a4, 441 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 5 +; CHECK-RV32-NEXT: bltz a3, .LBB61_959 +; CHECK-RV32-NEXT: j .LBB61_470 +; CHECK-RV32-NEXT: .LBB61_959: # %cond.load1765 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 443 +; CHECK-RV32-NEXT: li a4, 442 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 4 +; CHECK-RV32-NEXT: bltz a3, .LBB61_960 +; CHECK-RV32-NEXT: j .LBB61_471 +; CHECK-RV32-NEXT: .LBB61_960: # %cond.load1769 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 444 +; CHECK-RV32-NEXT: li a4, 443 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 3 +; CHECK-RV32-NEXT: bltz a3, .LBB61_961 +; CHECK-RV32-NEXT: j .LBB61_472 +; CHECK-RV32-NEXT: .LBB61_961: # %cond.load1773 +; CHECK-RV32-NEXT: lbu a3, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, 
a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a3 +; CHECK-RV32-NEXT: li a3, 445 +; CHECK-RV32-NEXT: li a4, 444 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a3, a2, 2 +; CHECK-RV32-NEXT: bgez a3, .LBB61_1038 +; CHECK-RV32-NEXT: j .LBB61_473 +; CHECK-RV32-NEXT: .LBB61_1038: # %cond.load1773 +; CHECK-RV32-NEXT: j .LBB61_474 +; CHECK-RV32-NEXT: .LBB61_962: # %cond.load1785 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 448 +; CHECK-RV32-NEXT: li a4, 447 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_963 +; CHECK-RV32-NEXT: j .LBB61_478 +; CHECK-RV32-NEXT: .LBB61_963: # %cond.load1789 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 449 +; CHECK-RV32-NEXT: li a4, 448 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_964 +; CHECK-RV32-NEXT: j .LBB61_479 +; CHECK-RV32-NEXT: .LBB61_964: # %cond.load1793 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 450 +; CHECK-RV32-NEXT: li a4, 449 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_965 +; CHECK-RV32-NEXT: j .LBB61_480 +; CHECK-RV32-NEXT: .LBB61_965: # %cond.load1797 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; 
CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 451 +; CHECK-RV32-NEXT: li a4, 450 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_966 +; CHECK-RV32-NEXT: j .LBB61_481 +; CHECK-RV32-NEXT: .LBB61_966: # %cond.load1801 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 452 +; CHECK-RV32-NEXT: li a4, 451 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_967 +; CHECK-RV32-NEXT: j .LBB61_482 +; CHECK-RV32-NEXT: .LBB61_967: # %cond.load1805 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 453 +; CHECK-RV32-NEXT: li a4, 452 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_968 +; CHECK-RV32-NEXT: j .LBB61_483 +; CHECK-RV32-NEXT: .LBB61_968: # %cond.load1809 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 454 +; CHECK-RV32-NEXT: li a4, 453 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_969 +; CHECK-RV32-NEXT: j .LBB61_484 +; CHECK-RV32-NEXT: .LBB61_969: # %cond.load1813 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; 
CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 455 +; CHECK-RV32-NEXT: li a4, 454 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_970 +; CHECK-RV32-NEXT: j .LBB61_485 +; CHECK-RV32-NEXT: .LBB61_970: # %cond.load1817 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 456 +; CHECK-RV32-NEXT: li a4, 455 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_971 +; CHECK-RV32-NEXT: j .LBB61_486 +; CHECK-RV32-NEXT: .LBB61_971: # %cond.load1821 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 457 +; CHECK-RV32-NEXT: li a4, 456 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_972 +; CHECK-RV32-NEXT: j .LBB61_487 +; CHECK-RV32-NEXT: .LBB61_972: # %cond.load1825 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 458 +; CHECK-RV32-NEXT: li a4, 457 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a3, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_973 +; CHECK-RV32-NEXT: j .LBB61_488 +; CHECK-RV32-NEXT: .LBB61_973: # %cond.load1829 +; 
CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 459 +; CHECK-RV32-NEXT: li a4, 458 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_974 +; CHECK-RV32-NEXT: j .LBB61_489 +; CHECK-RV32-NEXT: .LBB61_974: # %cond.load1833 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 460 +; CHECK-RV32-NEXT: li a4, 459 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_975 +; CHECK-RV32-NEXT: j .LBB61_490 +; CHECK-RV32-NEXT: .LBB61_975: # %cond.load1837 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 461 +; CHECK-RV32-NEXT: li a4, 460 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_976 +; CHECK-RV32-NEXT: j .LBB61_491 +; CHECK-RV32-NEXT: .LBB61_976: # %cond.load1841 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 462 +; CHECK-RV32-NEXT: li a4, 461 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_977 +; CHECK-RV32-NEXT: j .LBB61_492 +; CHECK-RV32-NEXT: 
.LBB61_977: # %cond.load1845 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 463 +; CHECK-RV32-NEXT: li a4, 462 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_978 +; CHECK-RV32-NEXT: j .LBB61_493 +; CHECK-RV32-NEXT: .LBB61_978: # %cond.load1849 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 464 +; CHECK-RV32-NEXT: li a4, 463 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_979 +; CHECK-RV32-NEXT: j .LBB61_494 +; CHECK-RV32-NEXT: .LBB61_979: # %cond.load1853 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 465 +; CHECK-RV32-NEXT: li a4, 464 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_980 +; CHECK-RV32-NEXT: j .LBB61_495 +; CHECK-RV32-NEXT: .LBB61_980: # %cond.load1857 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 466 +; CHECK-RV32-NEXT: li a4, 465 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_981 +; CHECK-RV32-NEXT: j 
.LBB61_496 +; CHECK-RV32-NEXT: .LBB61_981: # %cond.load1861 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 467 +; CHECK-RV32-NEXT: li a4, 466 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_982 +; CHECK-RV32-NEXT: j .LBB61_497 +; CHECK-RV32-NEXT: .LBB61_982: # %cond.load1865 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 468 +; CHECK-RV32-NEXT: li a4, 467 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_983 +; CHECK-RV32-NEXT: j .LBB61_498 +; CHECK-RV32-NEXT: .LBB61_983: # %cond.load1869 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 469 +; CHECK-RV32-NEXT: li a4, 468 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_984 +; CHECK-RV32-NEXT: j .LBB61_499 +; CHECK-RV32-NEXT: .LBB61_984: # %cond.load1873 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 470 +; CHECK-RV32-NEXT: li a4, 469 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 9 +; CHECK-RV32-NEXT: bltz a2, 
.LBB61_985 +; CHECK-RV32-NEXT: j .LBB61_500 +; CHECK-RV32-NEXT: .LBB61_985: # %cond.load1877 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 471 +; CHECK-RV32-NEXT: li a4, 470 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_986 +; CHECK-RV32-NEXT: j .LBB61_501 +; CHECK-RV32-NEXT: .LBB61_986: # %cond.load1881 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 472 +; CHECK-RV32-NEXT: li a4, 471 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_987 +; CHECK-RV32-NEXT: j .LBB61_502 +; CHECK-RV32-NEXT: .LBB61_987: # %cond.load1885 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 473 +; CHECK-RV32-NEXT: li a4, 472 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_988 +; CHECK-RV32-NEXT: j .LBB61_503 +; CHECK-RV32-NEXT: .LBB61_988: # %cond.load1889 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 474 +; CHECK-RV32-NEXT: li a4, 473 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 5 +; 
CHECK-RV32-NEXT: bltz a2, .LBB61_989 +; CHECK-RV32-NEXT: j .LBB61_504 +; CHECK-RV32-NEXT: .LBB61_989: # %cond.load1893 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 475 +; CHECK-RV32-NEXT: li a4, 474 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_990 +; CHECK-RV32-NEXT: j .LBB61_505 +; CHECK-RV32-NEXT: .LBB61_990: # %cond.load1897 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 476 +; CHECK-RV32-NEXT: li a4, 475 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_991 +; CHECK-RV32-NEXT: j .LBB61_506 +; CHECK-RV32-NEXT: .LBB61_991: # %cond.load1901 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a4, 512 +; CHECK-RV32-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v24, a2 +; CHECK-RV32-NEXT: li a2, 477 +; CHECK-RV32-NEXT: li a4, 476 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v24, a4 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a3, 2 +; CHECK-RV32-NEXT: bgez a2, .LBB61_1039 +; CHECK-RV32-NEXT: j .LBB61_507 +; CHECK-RV32-NEXT: .LBB61_1039: # %cond.load1901 +; CHECK-RV32-NEXT: j .LBB61_508 +; CHECK-RV32-NEXT: .LBB61_992: # %cond.load1913 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 480 +; CHECK-RV32-NEXT: li a3, 479 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; 
CHECK-RV32-NEXT: andi a2, a1, 1 +; CHECK-RV32-NEXT: bnez a2, .LBB61_993 +; CHECK-RV32-NEXT: j .LBB61_512 +; CHECK-RV32-NEXT: .LBB61_993: # %cond.load1917 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 481 +; CHECK-RV32-NEXT: li a3, 480 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 2 +; CHECK-RV32-NEXT: bnez a2, .LBB61_994 +; CHECK-RV32-NEXT: j .LBB61_513 +; CHECK-RV32-NEXT: .LBB61_994: # %cond.load1921 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 482 +; CHECK-RV32-NEXT: li a3, 481 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 4 +; CHECK-RV32-NEXT: bnez a2, .LBB61_995 +; CHECK-RV32-NEXT: j .LBB61_514 +; CHECK-RV32-NEXT: .LBB61_995: # %cond.load1925 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 483 +; CHECK-RV32-NEXT: li a3, 482 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 8 +; CHECK-RV32-NEXT: bnez a2, .LBB61_996 +; CHECK-RV32-NEXT: j .LBB61_515 +; CHECK-RV32-NEXT: .LBB61_996: # %cond.load1929 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 484 +; CHECK-RV32-NEXT: li a3, 483 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; 
CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 16 +; CHECK-RV32-NEXT: bnez a2, .LBB61_997 +; CHECK-RV32-NEXT: j .LBB61_516 +; CHECK-RV32-NEXT: .LBB61_997: # %cond.load1933 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 485 +; CHECK-RV32-NEXT: li a3, 484 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 32 +; CHECK-RV32-NEXT: bnez a2, .LBB61_998 +; CHECK-RV32-NEXT: j .LBB61_517 +; CHECK-RV32-NEXT: .LBB61_998: # %cond.load1937 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 486 +; CHECK-RV32-NEXT: li a3, 485 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 64 +; CHECK-RV32-NEXT: bnez a2, .LBB61_999 +; CHECK-RV32-NEXT: j .LBB61_518 +; CHECK-RV32-NEXT: .LBB61_999: # %cond.load1941 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 487 +; CHECK-RV32-NEXT: li a3, 486 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 128 +; CHECK-RV32-NEXT: bnez a2, .LBB61_1000 +; CHECK-RV32-NEXT: j .LBB61_519 +; CHECK-RV32-NEXT: .LBB61_1000: # %cond.load1945 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 488 +; CHECK-RV32-NEXT: li a3, 487 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: 
vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 256 +; CHECK-RV32-NEXT: bnez a2, .LBB61_1001 +; CHECK-RV32-NEXT: j .LBB61_520 +; CHECK-RV32-NEXT: .LBB61_1001: # %cond.load1949 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 489 +; CHECK-RV32-NEXT: li a3, 488 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 512 +; CHECK-RV32-NEXT: bnez a2, .LBB61_1002 +; CHECK-RV32-NEXT: j .LBB61_521 +; CHECK-RV32-NEXT: .LBB61_1002: # %cond.load1953 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 490 +; CHECK-RV32-NEXT: li a3, 489 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: andi a2, a1, 1024 +; CHECK-RV32-NEXT: bnez a2, .LBB61_1003 +; CHECK-RV32-NEXT: j .LBB61_522 +; CHECK-RV32-NEXT: .LBB61_1003: # %cond.load1957 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 491 +; CHECK-RV32-NEXT: li a3, 490 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 20 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1004 +; CHECK-RV32-NEXT: j .LBB61_523 +; CHECK-RV32-NEXT: .LBB61_1004: # %cond.load1961 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 492 +; CHECK-RV32-NEXT: li a3, 491 +; CHECK-RV32-NEXT: vsetvli zero, 
a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 19 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1005 +; CHECK-RV32-NEXT: j .LBB61_524 +; CHECK-RV32-NEXT: .LBB61_1005: # %cond.load1965 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 493 +; CHECK-RV32-NEXT: li a3, 492 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 18 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1006 +; CHECK-RV32-NEXT: j .LBB61_525 +; CHECK-RV32-NEXT: .LBB61_1006: # %cond.load1969 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 494 +; CHECK-RV32-NEXT: li a3, 493 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 17 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1007 +; CHECK-RV32-NEXT: j .LBB61_526 +; CHECK-RV32-NEXT: .LBB61_1007: # %cond.load1973 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 495 +; CHECK-RV32-NEXT: li a3, 494 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 16 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1008 +; CHECK-RV32-NEXT: j .LBB61_527 +; CHECK-RV32-NEXT: .LBB61_1008: # %cond.load1977 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 496 +; CHECK-RV32-NEXT: li a3, 495 
+; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 15 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1009 +; CHECK-RV32-NEXT: j .LBB61_528 +; CHECK-RV32-NEXT: .LBB61_1009: # %cond.load1981 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 497 +; CHECK-RV32-NEXT: li a3, 496 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 14 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1010 +; CHECK-RV32-NEXT: j .LBB61_529 +; CHECK-RV32-NEXT: .LBB61_1010: # %cond.load1985 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 498 +; CHECK-RV32-NEXT: li a3, 497 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 13 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1011 +; CHECK-RV32-NEXT: j .LBB61_530 +; CHECK-RV32-NEXT: .LBB61_1011: # %cond.load1989 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 499 +; CHECK-RV32-NEXT: li a3, 498 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 12 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1012 +; CHECK-RV32-NEXT: j .LBB61_531 +; CHECK-RV32-NEXT: .LBB61_1012: # %cond.load1993 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 
500 +; CHECK-RV32-NEXT: li a3, 499 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 11 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1013 +; CHECK-RV32-NEXT: j .LBB61_532 +; CHECK-RV32-NEXT: .LBB61_1013: # %cond.load1997 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 501 +; CHECK-RV32-NEXT: li a3, 500 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 10 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1014 +; CHECK-RV32-NEXT: j .LBB61_533 +; CHECK-RV32-NEXT: .LBB61_1014: # %cond.load2001 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 502 +; CHECK-RV32-NEXT: li a3, 501 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 9 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1015 +; CHECK-RV32-NEXT: j .LBB61_534 +; CHECK-RV32-NEXT: .LBB61_1015: # %cond.load2005 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 503 +; CHECK-RV32-NEXT: li a3, 502 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 8 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1016 +; CHECK-RV32-NEXT: j .LBB61_535 +; CHECK-RV32-NEXT: .LBB61_1016: # %cond.load2009 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, 
a2 +; CHECK-RV32-NEXT: li a2, 504 +; CHECK-RV32-NEXT: li a3, 503 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 7 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1017 +; CHECK-RV32-NEXT: j .LBB61_536 +; CHECK-RV32-NEXT: .LBB61_1017: # %cond.load2013 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 505 +; CHECK-RV32-NEXT: li a3, 504 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 6 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1018 +; CHECK-RV32-NEXT: j .LBB61_537 +; CHECK-RV32-NEXT: .LBB61_1018: # %cond.load2017 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 506 +; CHECK-RV32-NEXT: li a3, 505 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 5 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1019 +; CHECK-RV32-NEXT: j .LBB61_538 +; CHECK-RV32-NEXT: .LBB61_1019: # %cond.load2021 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 507 +; CHECK-RV32-NEXT: li a3, 506 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 4 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1020 +; CHECK-RV32-NEXT: j .LBB61_539 +; CHECK-RV32-NEXT: .LBB61_1020: # %cond.load2025 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 508 +; CHECK-RV32-NEXT: li a3, 507 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 3 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1021 +; CHECK-RV32-NEXT: j .LBB61_540 +; CHECK-RV32-NEXT: .LBB61_1021: # %cond.load2029 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 509 +; CHECK-RV32-NEXT: li a3, 508 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 2 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1022 +; CHECK-RV32-NEXT: j .LBB61_541 +; CHECK-RV32-NEXT: .LBB61_1022: # %cond.load2033 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 510 +; CHECK-RV32-NEXT: li a3, 509 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: slli a2, a1, 1 +; CHECK-RV32-NEXT: bltz a2, .LBB61_1023 +; CHECK-RV32-NEXT: j .LBB61_542 +; CHECK-RV32-NEXT: .LBB61_1023: # %cond.load2037 +; CHECK-RV32-NEXT: lbu a2, 0(a0) +; CHECK-RV32-NEXT: li a3, 512 +; CHECK-RV32-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV32-NEXT: vmv.s.x v16, a2 +; CHECK-RV32-NEXT: li a2, 511 +; CHECK-RV32-NEXT: li a3, 510 +; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV32-NEXT: addi a0, a0, 1 +; CHECK-RV32-NEXT: bltz a1, .LBB61_1024 +; CHECK-RV32-NEXT: j .LBB61_543 +; CHECK-RV32-NEXT: .LBB61_1024: # %cond.load2041 +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: li a1, 512 +; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; 
CHECK-RV32-NEXT: vmv.s.x v16, a0 +; CHECK-RV32-NEXT: li a0, 511 +; CHECK-RV32-NEXT: vslideup.vx v8, v16, a0 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_expandload_v512i8_vlen512: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v0 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: beqz a1, .LBB61_1 +; CHECK-RV64-NEXT: j .LBB61_527 +; CHECK-RV64-NEXT: .LBB61_1: # %else +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: beqz a1, .LBB61_2 +; CHECK-RV64-NEXT: j .LBB61_528 +; CHECK-RV64-NEXT: .LBB61_2: # %else2 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: beqz a1, .LBB61_3 +; CHECK-RV64-NEXT: j .LBB61_529 +; CHECK-RV64-NEXT: .LBB61_3: # %else6 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: beqz a1, .LBB61_4 +; CHECK-RV64-NEXT: j .LBB61_530 +; CHECK-RV64-NEXT: .LBB61_4: # %else10 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: beqz a1, .LBB61_5 +; CHECK-RV64-NEXT: j .LBB61_531 +; CHECK-RV64-NEXT: .LBB61_5: # %else14 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: beqz a1, .LBB61_6 +; CHECK-RV64-NEXT: j .LBB61_532 +; CHECK-RV64-NEXT: .LBB61_6: # %else18 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: beqz a1, .LBB61_7 +; CHECK-RV64-NEXT: j .LBB61_533 +; CHECK-RV64-NEXT: .LBB61_7: # %else22 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: beqz a1, .LBB61_8 +; CHECK-RV64-NEXT: j .LBB61_534 +; CHECK-RV64-NEXT: .LBB61_8: # %else26 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: beqz a1, .LBB61_9 +; CHECK-RV64-NEXT: j .LBB61_535 +; CHECK-RV64-NEXT: .LBB61_9: # %else30 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: beqz a1, .LBB61_10 +; CHECK-RV64-NEXT: j .LBB61_536 +; CHECK-RV64-NEXT: .LBB61_10: # %else34 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: beqz a1, .LBB61_11 +; CHECK-RV64-NEXT: j .LBB61_537 +; CHECK-RV64-NEXT: .LBB61_11: # %else38 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bgez a1, .LBB61_12 +; 
CHECK-RV64-NEXT: j .LBB61_538 +; CHECK-RV64-NEXT: .LBB61_12: # %else42 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bgez a1, .LBB61_13 +; CHECK-RV64-NEXT: j .LBB61_539 +; CHECK-RV64-NEXT: .LBB61_13: # %else46 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bgez a1, .LBB61_14 +; CHECK-RV64-NEXT: j .LBB61_540 +; CHECK-RV64-NEXT: .LBB61_14: # %else50 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bgez a1, .LBB61_15 +; CHECK-RV64-NEXT: j .LBB61_541 +; CHECK-RV64-NEXT: .LBB61_15: # %else54 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bgez a1, .LBB61_16 +; CHECK-RV64-NEXT: j .LBB61_542 +; CHECK-RV64-NEXT: .LBB61_16: # %else58 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bgez a1, .LBB61_17 +; CHECK-RV64-NEXT: j .LBB61_543 +; CHECK-RV64-NEXT: .LBB61_17: # %else62 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bgez a1, .LBB61_18 +; CHECK-RV64-NEXT: j .LBB61_544 +; CHECK-RV64-NEXT: .LBB61_18: # %else66 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bgez a1, .LBB61_19 +; CHECK-RV64-NEXT: j .LBB61_545 +; CHECK-RV64-NEXT: .LBB61_19: # %else70 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bgez a1, .LBB61_20 +; CHECK-RV64-NEXT: j .LBB61_546 +; CHECK-RV64-NEXT: .LBB61_20: # %else74 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bgez a1, .LBB61_21 +; CHECK-RV64-NEXT: j .LBB61_547 +; CHECK-RV64-NEXT: .LBB61_21: # %else78 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bgez a1, .LBB61_22 +; CHECK-RV64-NEXT: j .LBB61_548 +; CHECK-RV64-NEXT: .LBB61_22: # %else82 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bgez a1, .LBB61_23 +; CHECK-RV64-NEXT: j .LBB61_549 +; CHECK-RV64-NEXT: .LBB61_23: # %else86 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bgez a1, .LBB61_24 +; CHECK-RV64-NEXT: j .LBB61_550 +; CHECK-RV64-NEXT: .LBB61_24: # %else90 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bgez a1, .LBB61_25 +; CHECK-RV64-NEXT: j .LBB61_551 +; CHECK-RV64-NEXT: .LBB61_25: # 
%else94 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bgez a1, .LBB61_26 +; CHECK-RV64-NEXT: j .LBB61_552 +; CHECK-RV64-NEXT: .LBB61_26: # %else98 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bgez a1, .LBB61_27 +; CHECK-RV64-NEXT: j .LBB61_553 +; CHECK-RV64-NEXT: .LBB61_27: # %else102 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bgez a1, .LBB61_28 +; CHECK-RV64-NEXT: j .LBB61_554 +; CHECK-RV64-NEXT: .LBB61_28: # %else106 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bgez a1, .LBB61_29 +; CHECK-RV64-NEXT: j .LBB61_555 +; CHECK-RV64-NEXT: .LBB61_29: # %else110 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bgez a1, .LBB61_30 +; CHECK-RV64-NEXT: j .LBB61_556 +; CHECK-RV64-NEXT: .LBB61_30: # %else114 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bgez a1, .LBB61_31 +; CHECK-RV64-NEXT: j .LBB61_557 +; CHECK-RV64-NEXT: .LBB61_31: # %else118 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bgez a1, .LBB61_32 +; CHECK-RV64-NEXT: j .LBB61_558 +; CHECK-RV64-NEXT: .LBB61_32: # %else122 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bgez a1, .LBB61_33 +; CHECK-RV64-NEXT: j .LBB61_559 +; CHECK-RV64-NEXT: .LBB61_33: # %else126 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bgez a1, .LBB61_34 +; CHECK-RV64-NEXT: j .LBB61_560 +; CHECK-RV64-NEXT: .LBB61_34: # %else130 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bgez a1, .LBB61_35 +; CHECK-RV64-NEXT: j .LBB61_561 +; CHECK-RV64-NEXT: .LBB61_35: # %else134 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bgez a1, .LBB61_36 +; CHECK-RV64-NEXT: j .LBB61_562 +; CHECK-RV64-NEXT: .LBB61_36: # %else138 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bgez a1, .LBB61_37 +; CHECK-RV64-NEXT: j .LBB61_563 +; CHECK-RV64-NEXT: .LBB61_37: # %else142 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bgez a1, .LBB61_38 +; CHECK-RV64-NEXT: j .LBB61_564 +; CHECK-RV64-NEXT: .LBB61_38: # %else146 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; 
CHECK-RV64-NEXT: bgez a1, .LBB61_39 +; CHECK-RV64-NEXT: j .LBB61_565 +; CHECK-RV64-NEXT: .LBB61_39: # %else150 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bgez a1, .LBB61_40 +; CHECK-RV64-NEXT: j .LBB61_566 +; CHECK-RV64-NEXT: .LBB61_40: # %else154 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bgez a1, .LBB61_41 +; CHECK-RV64-NEXT: j .LBB61_567 +; CHECK-RV64-NEXT: .LBB61_41: # %else158 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bgez a1, .LBB61_42 +; CHECK-RV64-NEXT: j .LBB61_568 +; CHECK-RV64-NEXT: .LBB61_42: # %else162 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bgez a1, .LBB61_43 +; CHECK-RV64-NEXT: j .LBB61_569 +; CHECK-RV64-NEXT: .LBB61_43: # %else166 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bgez a1, .LBB61_44 +; CHECK-RV64-NEXT: j .LBB61_570 +; CHECK-RV64-NEXT: .LBB61_44: # %else170 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bgez a1, .LBB61_45 +; CHECK-RV64-NEXT: j .LBB61_571 +; CHECK-RV64-NEXT: .LBB61_45: # %else174 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bgez a1, .LBB61_46 +; CHECK-RV64-NEXT: j .LBB61_572 +; CHECK-RV64-NEXT: .LBB61_46: # %else178 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bgez a1, .LBB61_47 +; CHECK-RV64-NEXT: j .LBB61_573 +; CHECK-RV64-NEXT: .LBB61_47: # %else182 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_48 +; CHECK-RV64-NEXT: j .LBB61_574 +; CHECK-RV64-NEXT: .LBB61_48: # %else186 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bgez a1, .LBB61_49 +; CHECK-RV64-NEXT: j .LBB61_575 +; CHECK-RV64-NEXT: .LBB61_49: # %else190 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bgez a1, .LBB61_50 +; CHECK-RV64-NEXT: j .LBB61_576 +; CHECK-RV64-NEXT: .LBB61_50: # %else194 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bgez a1, .LBB61_51 +; CHECK-RV64-NEXT: j .LBB61_577 +; CHECK-RV64-NEXT: .LBB61_51: # %else198 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bgez a1, .LBB61_52 +; 
CHECK-RV64-NEXT: j .LBB61_578 +; CHECK-RV64-NEXT: .LBB61_52: # %else202 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bgez a1, .LBB61_53 +; CHECK-RV64-NEXT: j .LBB61_579 +; CHECK-RV64-NEXT: .LBB61_53: # %else206 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bgez a1, .LBB61_54 +; CHECK-RV64-NEXT: j .LBB61_580 +; CHECK-RV64-NEXT: .LBB61_54: # %else210 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bgez a1, .LBB61_55 +; CHECK-RV64-NEXT: j .LBB61_581 +; CHECK-RV64-NEXT: .LBB61_55: # %else214 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bgez a1, .LBB61_56 +; CHECK-RV64-NEXT: j .LBB61_582 +; CHECK-RV64-NEXT: .LBB61_56: # %else218 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bgez a1, .LBB61_57 +; CHECK-RV64-NEXT: j .LBB61_583 +; CHECK-RV64-NEXT: .LBB61_57: # %else222 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bgez a1, .LBB61_58 +; CHECK-RV64-NEXT: j .LBB61_584 +; CHECK-RV64-NEXT: .LBB61_58: # %else226 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bgez a1, .LBB61_59 +; CHECK-RV64-NEXT: j .LBB61_585 +; CHECK-RV64-NEXT: .LBB61_59: # %else230 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bgez a1, .LBB61_60 +; CHECK-RV64-NEXT: j .LBB61_586 +; CHECK-RV64-NEXT: .LBB61_60: # %else234 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bgez a1, .LBB61_61 +; CHECK-RV64-NEXT: j .LBB61_587 +; CHECK-RV64-NEXT: .LBB61_61: # %else238 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_63 +; CHECK-RV64-NEXT: .LBB61_62: # %cond.load241 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 62 +; CHECK-RV64-NEXT: li a3, 61 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; 
CHECK-RV64-NEXT: .LBB61_63: # %else242 +; CHECK-RV64-NEXT: slli a1, a2, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 1 +; CHECK-RV64-NEXT: bgez a1, .LBB61_65 +; CHECK-RV64-NEXT: # %bb.64: # %cond.load245 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v17, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 63 +; CHECK-RV64-NEXT: li a3, 62 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v17, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_65: # %else246 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_66 +; CHECK-RV64-NEXT: j .LBB61_588 +; CHECK-RV64-NEXT: .LBB61_66: # %else250 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: beqz a2, .LBB61_67 +; CHECK-RV64-NEXT: j .LBB61_589 +; CHECK-RV64-NEXT: .LBB61_67: # %else254 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: beqz a2, .LBB61_68 +; CHECK-RV64-NEXT: j .LBB61_590 +; CHECK-RV64-NEXT: .LBB61_68: # %else258 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: beqz a2, .LBB61_69 +; CHECK-RV64-NEXT: j .LBB61_591 +; CHECK-RV64-NEXT: .LBB61_69: # %else262 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: beqz a2, .LBB61_70 +; CHECK-RV64-NEXT: j .LBB61_592 +; CHECK-RV64-NEXT: .LBB61_70: # %else266 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: beqz a2, .LBB61_71 +; CHECK-RV64-NEXT: j .LBB61_593 +; CHECK-RV64-NEXT: .LBB61_71: # %else270 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: beqz a2, .LBB61_72 +; CHECK-RV64-NEXT: j .LBB61_594 +; CHECK-RV64-NEXT: .LBB61_72: # %else274 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: beqz a2, .LBB61_73 +; CHECK-RV64-NEXT: j .LBB61_595 +; CHECK-RV64-NEXT: .LBB61_73: # %else278 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: beqz a2, .LBB61_74 +; 
CHECK-RV64-NEXT: j .LBB61_596 +; CHECK-RV64-NEXT: .LBB61_74: # %else282 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: beqz a2, .LBB61_75 +; CHECK-RV64-NEXT: j .LBB61_597 +; CHECK-RV64-NEXT: .LBB61_75: # %else286 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: beqz a2, .LBB61_76 +; CHECK-RV64-NEXT: j .LBB61_598 +; CHECK-RV64-NEXT: .LBB61_76: # %else290 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: beqz a2, .LBB61_77 +; CHECK-RV64-NEXT: j .LBB61_599 +; CHECK-RV64-NEXT: .LBB61_77: # %else294 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bgez a2, .LBB61_78 +; CHECK-RV64-NEXT: j .LBB61_600 +; CHECK-RV64-NEXT: .LBB61_78: # %else298 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bgez a2, .LBB61_79 +; CHECK-RV64-NEXT: j .LBB61_601 +; CHECK-RV64-NEXT: .LBB61_79: # %else302 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bgez a2, .LBB61_80 +; CHECK-RV64-NEXT: j .LBB61_602 +; CHECK-RV64-NEXT: .LBB61_80: # %else306 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bgez a2, .LBB61_81 +; CHECK-RV64-NEXT: j .LBB61_603 +; CHECK-RV64-NEXT: .LBB61_81: # %else310 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bgez a2, .LBB61_82 +; CHECK-RV64-NEXT: j .LBB61_604 +; CHECK-RV64-NEXT: .LBB61_82: # %else314 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bgez a2, .LBB61_83 +; CHECK-RV64-NEXT: j .LBB61_605 +; CHECK-RV64-NEXT: .LBB61_83: # %else318 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bgez a2, .LBB61_84 +; CHECK-RV64-NEXT: j .LBB61_606 +; CHECK-RV64-NEXT: .LBB61_84: # %else322 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bgez a2, .LBB61_85 +; CHECK-RV64-NEXT: j .LBB61_607 +; CHECK-RV64-NEXT: .LBB61_85: # %else326 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bgez a2, .LBB61_86 +; CHECK-RV64-NEXT: j .LBB61_608 +; CHECK-RV64-NEXT: .LBB61_86: # %else330 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bgez a2, .LBB61_87 +; CHECK-RV64-NEXT: j .LBB61_609 +; 
CHECK-RV64-NEXT: .LBB61_87: # %else334 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bgez a2, .LBB61_88 +; CHECK-RV64-NEXT: j .LBB61_610 +; CHECK-RV64-NEXT: .LBB61_88: # %else338 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bgez a2, .LBB61_89 +; CHECK-RV64-NEXT: j .LBB61_611 +; CHECK-RV64-NEXT: .LBB61_89: # %else342 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bgez a2, .LBB61_90 +; CHECK-RV64-NEXT: j .LBB61_612 +; CHECK-RV64-NEXT: .LBB61_90: # %else346 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bgez a2, .LBB61_91 +; CHECK-RV64-NEXT: j .LBB61_613 +; CHECK-RV64-NEXT: .LBB61_91: # %else350 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bgez a2, .LBB61_92 +; CHECK-RV64-NEXT: j .LBB61_614 +; CHECK-RV64-NEXT: .LBB61_92: # %else354 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bgez a2, .LBB61_93 +; CHECK-RV64-NEXT: j .LBB61_615 +; CHECK-RV64-NEXT: .LBB61_93: # %else358 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bgez a2, .LBB61_94 +; CHECK-RV64-NEXT: j .LBB61_616 +; CHECK-RV64-NEXT: .LBB61_94: # %else362 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bgez a2, .LBB61_95 +; CHECK-RV64-NEXT: j .LBB61_617 +; CHECK-RV64-NEXT: .LBB61_95: # %else366 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bgez a2, .LBB61_96 +; CHECK-RV64-NEXT: j .LBB61_618 +; CHECK-RV64-NEXT: .LBB61_96: # %else370 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bgez a2, .LBB61_97 +; CHECK-RV64-NEXT: j .LBB61_619 +; CHECK-RV64-NEXT: .LBB61_97: # %else374 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bgez a2, .LBB61_98 +; CHECK-RV64-NEXT: j .LBB61_620 +; CHECK-RV64-NEXT: .LBB61_98: # %else378 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bgez a2, .LBB61_99 +; CHECK-RV64-NEXT: j .LBB61_621 +; CHECK-RV64-NEXT: .LBB61_99: # %else382 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bgez a2, .LBB61_100 +; CHECK-RV64-NEXT: j .LBB61_622 +; CHECK-RV64-NEXT: .LBB61_100: # %else386 +; 
CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bgez a2, .LBB61_101 +; CHECK-RV64-NEXT: j .LBB61_623 +; CHECK-RV64-NEXT: .LBB61_101: # %else390 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bgez a2, .LBB61_102 +; CHECK-RV64-NEXT: j .LBB61_624 +; CHECK-RV64-NEXT: .LBB61_102: # %else394 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bgez a2, .LBB61_103 +; CHECK-RV64-NEXT: j .LBB61_625 +; CHECK-RV64-NEXT: .LBB61_103: # %else398 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bgez a2, .LBB61_104 +; CHECK-RV64-NEXT: j .LBB61_626 +; CHECK-RV64-NEXT: .LBB61_104: # %else402 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bgez a2, .LBB61_105 +; CHECK-RV64-NEXT: j .LBB61_627 +; CHECK-RV64-NEXT: .LBB61_105: # %else406 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bgez a2, .LBB61_106 +; CHECK-RV64-NEXT: j .LBB61_628 +; CHECK-RV64-NEXT: .LBB61_106: # %else410 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bgez a2, .LBB61_107 +; CHECK-RV64-NEXT: j .LBB61_629 +; CHECK-RV64-NEXT: .LBB61_107: # %else414 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bgez a2, .LBB61_108 +; CHECK-RV64-NEXT: j .LBB61_630 +; CHECK-RV64-NEXT: .LBB61_108: # %else418 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bgez a2, .LBB61_109 +; CHECK-RV64-NEXT: j .LBB61_631 +; CHECK-RV64-NEXT: .LBB61_109: # %else422 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bgez a2, .LBB61_110 +; CHECK-RV64-NEXT: j .LBB61_632 +; CHECK-RV64-NEXT: .LBB61_110: # %else426 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bgez a2, .LBB61_111 +; CHECK-RV64-NEXT: j .LBB61_633 +; CHECK-RV64-NEXT: .LBB61_111: # %else430 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bgez a2, .LBB61_112 +; CHECK-RV64-NEXT: j .LBB61_634 +; CHECK-RV64-NEXT: .LBB61_112: # %else434 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bgez a2, .LBB61_113 +; CHECK-RV64-NEXT: j .LBB61_635 +; CHECK-RV64-NEXT: .LBB61_113: # %else438 +; CHECK-RV64-NEXT: slli 
a2, a1, 16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_114 +; CHECK-RV64-NEXT: j .LBB61_636 +; CHECK-RV64-NEXT: .LBB61_114: # %else442 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bgez a2, .LBB61_115 +; CHECK-RV64-NEXT: j .LBB61_637 +; CHECK-RV64-NEXT: .LBB61_115: # %else446 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bgez a2, .LBB61_116 +; CHECK-RV64-NEXT: j .LBB61_638 +; CHECK-RV64-NEXT: .LBB61_116: # %else450 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bgez a2, .LBB61_117 +; CHECK-RV64-NEXT: j .LBB61_639 +; CHECK-RV64-NEXT: .LBB61_117: # %else454 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bgez a2, .LBB61_118 +; CHECK-RV64-NEXT: j .LBB61_640 +; CHECK-RV64-NEXT: .LBB61_118: # %else458 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bgez a2, .LBB61_119 +; CHECK-RV64-NEXT: j .LBB61_641 +; CHECK-RV64-NEXT: .LBB61_119: # %else462 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bgez a2, .LBB61_120 +; CHECK-RV64-NEXT: j .LBB61_642 +; CHECK-RV64-NEXT: .LBB61_120: # %else466 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bgez a2, .LBB61_121 +; CHECK-RV64-NEXT: j .LBB61_643 +; CHECK-RV64-NEXT: .LBB61_121: # %else470 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bgez a2, .LBB61_122 +; CHECK-RV64-NEXT: j .LBB61_644 +; CHECK-RV64-NEXT: .LBB61_122: # %else474 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bgez a2, .LBB61_123 +; CHECK-RV64-NEXT: j .LBB61_645 +; CHECK-RV64-NEXT: .LBB61_123: # %else478 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bgez a2, .LBB61_124 +; CHECK-RV64-NEXT: j .LBB61_646 +; CHECK-RV64-NEXT: .LBB61_124: # %else482 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bgez a2, .LBB61_125 +; CHECK-RV64-NEXT: j .LBB61_647 +; CHECK-RV64-NEXT: .LBB61_125: # %else486 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bgez a2, .LBB61_126 +; CHECK-RV64-NEXT: j .LBB61_648 +; CHECK-RV64-NEXT: .LBB61_126: # %else490 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: 
bgez a2, .LBB61_127 +; CHECK-RV64-NEXT: j .LBB61_649 +; CHECK-RV64-NEXT: .LBB61_127: # %else494 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_129 +; CHECK-RV64-NEXT: .LBB61_128: # %cond.load497 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 126 +; CHECK-RV64-NEXT: li a3, 125 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: .LBB61_129: # %else498 +; CHECK-RV64-NEXT: slli a2, a1, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_131 +; CHECK-RV64-NEXT: # %bb.130: # %cond.load501 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v18, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 127 +; CHECK-RV64-NEXT: li a3, 126 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v18, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_131: # %else502 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_132 +; CHECK-RV64-NEXT: j .LBB61_650 +; CHECK-RV64-NEXT: .LBB61_132: # %else506 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: beqz a1, .LBB61_133 +; CHECK-RV64-NEXT: j .LBB61_651 +; CHECK-RV64-NEXT: .LBB61_133: # %else510 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: beqz a1, .LBB61_134 +; CHECK-RV64-NEXT: j .LBB61_652 +; CHECK-RV64-NEXT: .LBB61_134: # %else514 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: beqz a1, .LBB61_135 +; CHECK-RV64-NEXT: j .LBB61_653 +; CHECK-RV64-NEXT: 
.LBB61_135: # %else518 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: beqz a1, .LBB61_136 +; CHECK-RV64-NEXT: j .LBB61_654 +; CHECK-RV64-NEXT: .LBB61_136: # %else522 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: beqz a1, .LBB61_137 +; CHECK-RV64-NEXT: j .LBB61_655 +; CHECK-RV64-NEXT: .LBB61_137: # %else526 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: beqz a1, .LBB61_138 +; CHECK-RV64-NEXT: j .LBB61_656 +; CHECK-RV64-NEXT: .LBB61_138: # %else530 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: beqz a1, .LBB61_139 +; CHECK-RV64-NEXT: j .LBB61_657 +; CHECK-RV64-NEXT: .LBB61_139: # %else534 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: beqz a1, .LBB61_140 +; CHECK-RV64-NEXT: j .LBB61_658 +; CHECK-RV64-NEXT: .LBB61_140: # %else538 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: beqz a1, .LBB61_141 +; CHECK-RV64-NEXT: j .LBB61_659 +; CHECK-RV64-NEXT: .LBB61_141: # %else542 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: beqz a1, .LBB61_142 +; CHECK-RV64-NEXT: j .LBB61_660 +; CHECK-RV64-NEXT: .LBB61_142: # %else546 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: beqz a1, .LBB61_143 +; CHECK-RV64-NEXT: j .LBB61_661 +; CHECK-RV64-NEXT: .LBB61_143: # %else550 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bgez a1, .LBB61_144 +; CHECK-RV64-NEXT: j .LBB61_662 +; CHECK-RV64-NEXT: .LBB61_144: # %else554 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bgez a1, .LBB61_145 +; CHECK-RV64-NEXT: j .LBB61_663 +; CHECK-RV64-NEXT: .LBB61_145: # %else558 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bgez a1, .LBB61_146 +; CHECK-RV64-NEXT: j .LBB61_664 +; CHECK-RV64-NEXT: .LBB61_146: # %else562 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bgez a1, .LBB61_147 +; CHECK-RV64-NEXT: j .LBB61_665 +; CHECK-RV64-NEXT: .LBB61_147: # %else566 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bgez a1, .LBB61_148 +; CHECK-RV64-NEXT: j .LBB61_666 +; CHECK-RV64-NEXT: .LBB61_148: # 
%else570 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bgez a1, .LBB61_149 +; CHECK-RV64-NEXT: j .LBB61_667 +; CHECK-RV64-NEXT: .LBB61_149: # %else574 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bgez a1, .LBB61_150 +; CHECK-RV64-NEXT: j .LBB61_668 +; CHECK-RV64-NEXT: .LBB61_150: # %else578 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bgez a1, .LBB61_151 +; CHECK-RV64-NEXT: j .LBB61_669 +; CHECK-RV64-NEXT: .LBB61_151: # %else582 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bgez a1, .LBB61_152 +; CHECK-RV64-NEXT: j .LBB61_670 +; CHECK-RV64-NEXT: .LBB61_152: # %else586 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bgez a1, .LBB61_153 +; CHECK-RV64-NEXT: j .LBB61_671 +; CHECK-RV64-NEXT: .LBB61_153: # %else590 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bgez a1, .LBB61_154 +; CHECK-RV64-NEXT: j .LBB61_672 +; CHECK-RV64-NEXT: .LBB61_154: # %else594 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bgez a1, .LBB61_155 +; CHECK-RV64-NEXT: j .LBB61_673 +; CHECK-RV64-NEXT: .LBB61_155: # %else598 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bgez a1, .LBB61_156 +; CHECK-RV64-NEXT: j .LBB61_674 +; CHECK-RV64-NEXT: .LBB61_156: # %else602 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bgez a1, .LBB61_157 +; CHECK-RV64-NEXT: j .LBB61_675 +; CHECK-RV64-NEXT: .LBB61_157: # %else606 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bgez a1, .LBB61_158 +; CHECK-RV64-NEXT: j .LBB61_676 +; CHECK-RV64-NEXT: .LBB61_158: # %else610 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bgez a1, .LBB61_159 +; CHECK-RV64-NEXT: j .LBB61_677 +; CHECK-RV64-NEXT: .LBB61_159: # %else614 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bgez a1, .LBB61_160 +; CHECK-RV64-NEXT: j .LBB61_678 +; CHECK-RV64-NEXT: .LBB61_160: # %else618 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bgez a1, .LBB61_161 +; CHECK-RV64-NEXT: j .LBB61_679 +; CHECK-RV64-NEXT: .LBB61_161: # %else622 +; 
CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bgez a1, .LBB61_162 +; CHECK-RV64-NEXT: j .LBB61_680 +; CHECK-RV64-NEXT: .LBB61_162: # %else626 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bgez a1, .LBB61_163 +; CHECK-RV64-NEXT: j .LBB61_681 +; CHECK-RV64-NEXT: .LBB61_163: # %else630 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bgez a1, .LBB61_164 +; CHECK-RV64-NEXT: j .LBB61_682 +; CHECK-RV64-NEXT: .LBB61_164: # %else634 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bgez a1, .LBB61_165 +; CHECK-RV64-NEXT: j .LBB61_683 +; CHECK-RV64-NEXT: .LBB61_165: # %else638 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bgez a1, .LBB61_166 +; CHECK-RV64-NEXT: j .LBB61_684 +; CHECK-RV64-NEXT: .LBB61_166: # %else642 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bgez a1, .LBB61_167 +; CHECK-RV64-NEXT: j .LBB61_685 +; CHECK-RV64-NEXT: .LBB61_167: # %else646 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bgez a1, .LBB61_168 +; CHECK-RV64-NEXT: j .LBB61_686 +; CHECK-RV64-NEXT: .LBB61_168: # %else650 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bgez a1, .LBB61_169 +; CHECK-RV64-NEXT: j .LBB61_687 +; CHECK-RV64-NEXT: .LBB61_169: # %else654 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bgez a1, .LBB61_170 +; CHECK-RV64-NEXT: j .LBB61_688 +; CHECK-RV64-NEXT: .LBB61_170: # %else658 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bgez a1, .LBB61_171 +; CHECK-RV64-NEXT: j .LBB61_689 +; CHECK-RV64-NEXT: .LBB61_171: # %else662 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bgez a1, .LBB61_172 +; CHECK-RV64-NEXT: j .LBB61_690 +; CHECK-RV64-NEXT: .LBB61_172: # %else666 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bgez a1, .LBB61_173 +; CHECK-RV64-NEXT: j .LBB61_691 +; CHECK-RV64-NEXT: .LBB61_173: # %else670 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bgez a1, .LBB61_174 +; CHECK-RV64-NEXT: j .LBB61_692 +; CHECK-RV64-NEXT: .LBB61_174: # %else674 +; CHECK-RV64-NEXT: slli 
a1, a2, 21 +; CHECK-RV64-NEXT: bgez a1, .LBB61_175 +; CHECK-RV64-NEXT: j .LBB61_693 +; CHECK-RV64-NEXT: .LBB61_175: # %else678 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bgez a1, .LBB61_176 +; CHECK-RV64-NEXT: j .LBB61_694 +; CHECK-RV64-NEXT: .LBB61_176: # %else682 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bgez a1, .LBB61_177 +; CHECK-RV64-NEXT: j .LBB61_695 +; CHECK-RV64-NEXT: .LBB61_177: # %else686 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bgez a1, .LBB61_178 +; CHECK-RV64-NEXT: j .LBB61_696 +; CHECK-RV64-NEXT: .LBB61_178: # %else690 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bgez a1, .LBB61_179 +; CHECK-RV64-NEXT: j .LBB61_697 +; CHECK-RV64-NEXT: .LBB61_179: # %else694 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_180 +; CHECK-RV64-NEXT: j .LBB61_698 +; CHECK-RV64-NEXT: .LBB61_180: # %else698 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bgez a1, .LBB61_181 +; CHECK-RV64-NEXT: j .LBB61_699 +; CHECK-RV64-NEXT: .LBB61_181: # %else702 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bgez a1, .LBB61_182 +; CHECK-RV64-NEXT: j .LBB61_700 +; CHECK-RV64-NEXT: .LBB61_182: # %else706 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bgez a1, .LBB61_183 +; CHECK-RV64-NEXT: j .LBB61_701 +; CHECK-RV64-NEXT: .LBB61_183: # %else710 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bgez a1, .LBB61_184 +; CHECK-RV64-NEXT: j .LBB61_702 +; CHECK-RV64-NEXT: .LBB61_184: # %else714 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bgez a1, .LBB61_185 +; CHECK-RV64-NEXT: j .LBB61_703 +; CHECK-RV64-NEXT: .LBB61_185: # %else718 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bgez a1, .LBB61_186 +; CHECK-RV64-NEXT: j .LBB61_704 +; CHECK-RV64-NEXT: .LBB61_186: # %else722 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bgez a1, .LBB61_187 +; CHECK-RV64-NEXT: j .LBB61_705 +; CHECK-RV64-NEXT: .LBB61_187: # %else726 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; 
CHECK-RV64-NEXT: bgez a1, .LBB61_188 +; CHECK-RV64-NEXT: j .LBB61_706 +; CHECK-RV64-NEXT: .LBB61_188: # %else730 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bgez a1, .LBB61_189 +; CHECK-RV64-NEXT: j .LBB61_707 +; CHECK-RV64-NEXT: .LBB61_189: # %else734 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bgez a1, .LBB61_190 +; CHECK-RV64-NEXT: j .LBB61_708 +; CHECK-RV64-NEXT: .LBB61_190: # %else738 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bgez a1, .LBB61_191 +; CHECK-RV64-NEXT: j .LBB61_709 +; CHECK-RV64-NEXT: .LBB61_191: # %else742 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bgez a1, .LBB61_192 +; CHECK-RV64-NEXT: j .LBB61_710 +; CHECK-RV64-NEXT: .LBB61_192: # %else746 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bgez a1, .LBB61_193 +; CHECK-RV64-NEXT: j .LBB61_711 +; CHECK-RV64-NEXT: .LBB61_193: # %else750 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_195 +; CHECK-RV64-NEXT: .LBB61_194: # %cond.load753 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 190 +; CHECK-RV64-NEXT: li a3, 189 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_195: # %else754 +; CHECK-RV64-NEXT: slli a1, a2, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 3 +; CHECK-RV64-NEXT: bgez a1, .LBB61_197 +; CHECK-RV64-NEXT: # %bb.196: # %cond.load757 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v20, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 191 +; CHECK-RV64-NEXT: li a3, 190 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v20, a3 +; CHECK-RV64-NEXT: 
addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_197: # %else758 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_198 +; CHECK-RV64-NEXT: j .LBB61_712 +; CHECK-RV64-NEXT: .LBB61_198: # %else762 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: beqz a2, .LBB61_199 +; CHECK-RV64-NEXT: j .LBB61_713 +; CHECK-RV64-NEXT: .LBB61_199: # %else766 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: beqz a2, .LBB61_200 +; CHECK-RV64-NEXT: j .LBB61_714 +; CHECK-RV64-NEXT: .LBB61_200: # %else770 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: beqz a2, .LBB61_201 +; CHECK-RV64-NEXT: j .LBB61_715 +; CHECK-RV64-NEXT: .LBB61_201: # %else774 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: beqz a2, .LBB61_202 +; CHECK-RV64-NEXT: j .LBB61_716 +; CHECK-RV64-NEXT: .LBB61_202: # %else778 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: beqz a2, .LBB61_203 +; CHECK-RV64-NEXT: j .LBB61_717 +; CHECK-RV64-NEXT: .LBB61_203: # %else782 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: beqz a2, .LBB61_204 +; CHECK-RV64-NEXT: j .LBB61_718 +; CHECK-RV64-NEXT: .LBB61_204: # %else786 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: beqz a2, .LBB61_205 +; CHECK-RV64-NEXT: j .LBB61_719 +; CHECK-RV64-NEXT: .LBB61_205: # %else790 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: beqz a2, .LBB61_206 +; CHECK-RV64-NEXT: j .LBB61_720 +; CHECK-RV64-NEXT: .LBB61_206: # %else794 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: beqz a2, .LBB61_207 +; CHECK-RV64-NEXT: j .LBB61_721 +; CHECK-RV64-NEXT: .LBB61_207: # %else798 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: beqz a2, .LBB61_208 +; CHECK-RV64-NEXT: j .LBB61_722 +; CHECK-RV64-NEXT: .LBB61_208: # %else802 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: beqz a2, .LBB61_209 +; CHECK-RV64-NEXT: j .LBB61_723 +; CHECK-RV64-NEXT: .LBB61_209: # 
%else806 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bgez a2, .LBB61_210 +; CHECK-RV64-NEXT: j .LBB61_724 +; CHECK-RV64-NEXT: .LBB61_210: # %else810 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bgez a2, .LBB61_211 +; CHECK-RV64-NEXT: j .LBB61_725 +; CHECK-RV64-NEXT: .LBB61_211: # %else814 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bgez a2, .LBB61_212 +; CHECK-RV64-NEXT: j .LBB61_726 +; CHECK-RV64-NEXT: .LBB61_212: # %else818 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bgez a2, .LBB61_213 +; CHECK-RV64-NEXT: j .LBB61_727 +; CHECK-RV64-NEXT: .LBB61_213: # %else822 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bgez a2, .LBB61_214 +; CHECK-RV64-NEXT: j .LBB61_728 +; CHECK-RV64-NEXT: .LBB61_214: # %else826 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bgez a2, .LBB61_215 +; CHECK-RV64-NEXT: j .LBB61_729 +; CHECK-RV64-NEXT: .LBB61_215: # %else830 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bgez a2, .LBB61_216 +; CHECK-RV64-NEXT: j .LBB61_730 +; CHECK-RV64-NEXT: .LBB61_216: # %else834 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bgez a2, .LBB61_217 +; CHECK-RV64-NEXT: j .LBB61_731 +; CHECK-RV64-NEXT: .LBB61_217: # %else838 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bgez a2, .LBB61_218 +; CHECK-RV64-NEXT: j .LBB61_732 +; CHECK-RV64-NEXT: .LBB61_218: # %else842 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bgez a2, .LBB61_219 +; CHECK-RV64-NEXT: j .LBB61_733 +; CHECK-RV64-NEXT: .LBB61_219: # %else846 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bgez a2, .LBB61_220 +; CHECK-RV64-NEXT: j .LBB61_734 +; CHECK-RV64-NEXT: .LBB61_220: # %else850 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bgez a2, .LBB61_221 +; CHECK-RV64-NEXT: j .LBB61_735 +; CHECK-RV64-NEXT: .LBB61_221: # %else854 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bgez a2, .LBB61_222 +; CHECK-RV64-NEXT: j .LBB61_736 +; CHECK-RV64-NEXT: .LBB61_222: # %else858 +; 
CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bgez a2, .LBB61_223 +; CHECK-RV64-NEXT: j .LBB61_737 +; CHECK-RV64-NEXT: .LBB61_223: # %else862 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bgez a2, .LBB61_224 +; CHECK-RV64-NEXT: j .LBB61_738 +; CHECK-RV64-NEXT: .LBB61_224: # %else866 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bgez a2, .LBB61_225 +; CHECK-RV64-NEXT: j .LBB61_739 +; CHECK-RV64-NEXT: .LBB61_225: # %else870 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bgez a2, .LBB61_226 +; CHECK-RV64-NEXT: j .LBB61_740 +; CHECK-RV64-NEXT: .LBB61_226: # %else874 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bgez a2, .LBB61_227 +; CHECK-RV64-NEXT: j .LBB61_741 +; CHECK-RV64-NEXT: .LBB61_227: # %else878 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bgez a2, .LBB61_228 +; CHECK-RV64-NEXT: j .LBB61_742 +; CHECK-RV64-NEXT: .LBB61_228: # %else882 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bgez a2, .LBB61_229 +; CHECK-RV64-NEXT: j .LBB61_743 +; CHECK-RV64-NEXT: .LBB61_229: # %else886 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bgez a2, .LBB61_230 +; CHECK-RV64-NEXT: j .LBB61_744 +; CHECK-RV64-NEXT: .LBB61_230: # %else890 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bgez a2, .LBB61_231 +; CHECK-RV64-NEXT: j .LBB61_745 +; CHECK-RV64-NEXT: .LBB61_231: # %else894 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bgez a2, .LBB61_232 +; CHECK-RV64-NEXT: j .LBB61_746 +; CHECK-RV64-NEXT: .LBB61_232: # %else898 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bgez a2, .LBB61_233 +; CHECK-RV64-NEXT: j .LBB61_747 +; CHECK-RV64-NEXT: .LBB61_233: # %else902 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bgez a2, .LBB61_234 +; CHECK-RV64-NEXT: j .LBB61_748 +; CHECK-RV64-NEXT: .LBB61_234: # %else906 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bgez a2, .LBB61_235 +; CHECK-RV64-NEXT: j .LBB61_749 +; CHECK-RV64-NEXT: .LBB61_235: # %else910 +; CHECK-RV64-NEXT: slli 
a2, a1, 26 +; CHECK-RV64-NEXT: bgez a2, .LBB61_236 +; CHECK-RV64-NEXT: j .LBB61_750 +; CHECK-RV64-NEXT: .LBB61_236: # %else914 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bgez a2, .LBB61_237 +; CHECK-RV64-NEXT: j .LBB61_751 +; CHECK-RV64-NEXT: .LBB61_237: # %else918 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bgez a2, .LBB61_238 +; CHECK-RV64-NEXT: j .LBB61_752 +; CHECK-RV64-NEXT: .LBB61_238: # %else922 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bgez a2, .LBB61_239 +; CHECK-RV64-NEXT: j .LBB61_753 +; CHECK-RV64-NEXT: .LBB61_239: # %else926 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bgez a2, .LBB61_240 +; CHECK-RV64-NEXT: j .LBB61_754 +; CHECK-RV64-NEXT: .LBB61_240: # %else930 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bgez a2, .LBB61_241 +; CHECK-RV64-NEXT: j .LBB61_755 +; CHECK-RV64-NEXT: .LBB61_241: # %else934 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bgez a2, .LBB61_242 +; CHECK-RV64-NEXT: j .LBB61_756 +; CHECK-RV64-NEXT: .LBB61_242: # %else938 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bgez a2, .LBB61_243 +; CHECK-RV64-NEXT: j .LBB61_757 +; CHECK-RV64-NEXT: .LBB61_243: # %else942 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bgez a2, .LBB61_244 +; CHECK-RV64-NEXT: j .LBB61_758 +; CHECK-RV64-NEXT: .LBB61_244: # %else946 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bgez a2, .LBB61_245 +; CHECK-RV64-NEXT: j .LBB61_759 +; CHECK-RV64-NEXT: .LBB61_245: # %else950 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_246 +; CHECK-RV64-NEXT: j .LBB61_760 +; CHECK-RV64-NEXT: .LBB61_246: # %else954 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bgez a2, .LBB61_247 +; CHECK-RV64-NEXT: j .LBB61_761 +; CHECK-RV64-NEXT: .LBB61_247: # %else958 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bgez a2, .LBB61_248 +; CHECK-RV64-NEXT: j .LBB61_762 +; CHECK-RV64-NEXT: .LBB61_248: # %else962 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; 
CHECK-RV64-NEXT: bgez a2, .LBB61_249 +; CHECK-RV64-NEXT: j .LBB61_763 +; CHECK-RV64-NEXT: .LBB61_249: # %else966 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bgez a2, .LBB61_250 +; CHECK-RV64-NEXT: j .LBB61_764 +; CHECK-RV64-NEXT: .LBB61_250: # %else970 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bgez a2, .LBB61_251 +; CHECK-RV64-NEXT: j .LBB61_765 +; CHECK-RV64-NEXT: .LBB61_251: # %else974 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bgez a2, .LBB61_252 +; CHECK-RV64-NEXT: j .LBB61_766 +; CHECK-RV64-NEXT: .LBB61_252: # %else978 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bgez a2, .LBB61_253 +; CHECK-RV64-NEXT: j .LBB61_767 +; CHECK-RV64-NEXT: .LBB61_253: # %else982 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bgez a2, .LBB61_254 +; CHECK-RV64-NEXT: j .LBB61_768 +; CHECK-RV64-NEXT: .LBB61_254: # %else986 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bgez a2, .LBB61_255 +; CHECK-RV64-NEXT: j .LBB61_769 +; CHECK-RV64-NEXT: .LBB61_255: # %else990 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bgez a2, .LBB61_256 +; CHECK-RV64-NEXT: j .LBB61_770 +; CHECK-RV64-NEXT: .LBB61_256: # %else994 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bgez a2, .LBB61_257 +; CHECK-RV64-NEXT: j .LBB61_771 +; CHECK-RV64-NEXT: .LBB61_257: # %else998 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bgez a2, .LBB61_258 +; CHECK-RV64-NEXT: j .LBB61_772 +; CHECK-RV64-NEXT: .LBB61_258: # %else1002 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bgez a2, .LBB61_259 +; CHECK-RV64-NEXT: j .LBB61_773 +; CHECK-RV64-NEXT: .LBB61_259: # %else1006 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_261 +; CHECK-RV64-NEXT: .LBB61_260: # %cond.load1009 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 254 +; CHECK-RV64-NEXT: li a3, 
253 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_261: # %else1010 +; CHECK-RV64-NEXT: slli a2, a1, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 4 +; CHECK-RV64-NEXT: bgez a2, .LBB61_263 +; CHECK-RV64-NEXT: # %bb.262: # %cond.load1013 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v20, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 255 +; CHECK-RV64-NEXT: li a3, 254 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v20, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: .LBB61_263: # %else1014 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_264 +; CHECK-RV64-NEXT: j .LBB61_774 +; CHECK-RV64-NEXT: .LBB61_264: # %else1018 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: beqz a1, .LBB61_265 +; CHECK-RV64-NEXT: j .LBB61_775 +; CHECK-RV64-NEXT: .LBB61_265: # %else1022 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: beqz a1, .LBB61_266 +; CHECK-RV64-NEXT: j .LBB61_776 +; CHECK-RV64-NEXT: .LBB61_266: # %else1026 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: beqz a1, .LBB61_267 +; CHECK-RV64-NEXT: j .LBB61_777 +; CHECK-RV64-NEXT: .LBB61_267: # %else1030 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: beqz a1, .LBB61_268 +; CHECK-RV64-NEXT: j .LBB61_778 +; CHECK-RV64-NEXT: .LBB61_268: # %else1034 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: beqz a1, .LBB61_269 +; CHECK-RV64-NEXT: j .LBB61_779 +; CHECK-RV64-NEXT: .LBB61_269: # %else1038 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: beqz a1, .LBB61_270 +; CHECK-RV64-NEXT: j .LBB61_780 +; CHECK-RV64-NEXT: .LBB61_270: 
# %else1042 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: beqz a1, .LBB61_271 +; CHECK-RV64-NEXT: j .LBB61_781 +; CHECK-RV64-NEXT: .LBB61_271: # %else1046 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: beqz a1, .LBB61_272 +; CHECK-RV64-NEXT: j .LBB61_782 +; CHECK-RV64-NEXT: .LBB61_272: # %else1050 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: beqz a1, .LBB61_273 +; CHECK-RV64-NEXT: j .LBB61_783 +; CHECK-RV64-NEXT: .LBB61_273: # %else1054 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: beqz a1, .LBB61_274 +; CHECK-RV64-NEXT: j .LBB61_784 +; CHECK-RV64-NEXT: .LBB61_274: # %else1058 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: beqz a1, .LBB61_275 +; CHECK-RV64-NEXT: j .LBB61_785 +; CHECK-RV64-NEXT: .LBB61_275: # %else1062 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bgez a1, .LBB61_276 +; CHECK-RV64-NEXT: j .LBB61_786 +; CHECK-RV64-NEXT: .LBB61_276: # %else1066 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bgez a1, .LBB61_277 +; CHECK-RV64-NEXT: j .LBB61_787 +; CHECK-RV64-NEXT: .LBB61_277: # %else1070 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bgez a1, .LBB61_278 +; CHECK-RV64-NEXT: j .LBB61_788 +; CHECK-RV64-NEXT: .LBB61_278: # %else1074 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bgez a1, .LBB61_279 +; CHECK-RV64-NEXT: j .LBB61_789 +; CHECK-RV64-NEXT: .LBB61_279: # %else1078 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bgez a1, .LBB61_280 +; CHECK-RV64-NEXT: j .LBB61_790 +; CHECK-RV64-NEXT: .LBB61_280: # %else1082 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bgez a1, .LBB61_281 +; CHECK-RV64-NEXT: j .LBB61_791 +; CHECK-RV64-NEXT: .LBB61_281: # %else1086 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bgez a1, .LBB61_282 +; CHECK-RV64-NEXT: j .LBB61_792 +; CHECK-RV64-NEXT: .LBB61_282: # %else1090 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bgez a1, .LBB61_283 +; CHECK-RV64-NEXT: j .LBB61_793 +; CHECK-RV64-NEXT: .LBB61_283: # 
%else1094 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bgez a1, .LBB61_284 +; CHECK-RV64-NEXT: j .LBB61_794 +; CHECK-RV64-NEXT: .LBB61_284: # %else1098 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bgez a1, .LBB61_285 +; CHECK-RV64-NEXT: j .LBB61_795 +; CHECK-RV64-NEXT: .LBB61_285: # %else1102 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bgez a1, .LBB61_286 +; CHECK-RV64-NEXT: j .LBB61_796 +; CHECK-RV64-NEXT: .LBB61_286: # %else1106 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bgez a1, .LBB61_287 +; CHECK-RV64-NEXT: j .LBB61_797 +; CHECK-RV64-NEXT: .LBB61_287: # %else1110 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bgez a1, .LBB61_288 +; CHECK-RV64-NEXT: j .LBB61_798 +; CHECK-RV64-NEXT: .LBB61_288: # %else1114 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bgez a1, .LBB61_289 +; CHECK-RV64-NEXT: j .LBB61_799 +; CHECK-RV64-NEXT: .LBB61_289: # %else1118 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bgez a1, .LBB61_290 +; CHECK-RV64-NEXT: j .LBB61_800 +; CHECK-RV64-NEXT: .LBB61_290: # %else1122 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bgez a1, .LBB61_291 +; CHECK-RV64-NEXT: j .LBB61_801 +; CHECK-RV64-NEXT: .LBB61_291: # %else1126 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bgez a1, .LBB61_292 +; CHECK-RV64-NEXT: j .LBB61_802 +; CHECK-RV64-NEXT: .LBB61_292: # %else1130 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bgez a1, .LBB61_293 +; CHECK-RV64-NEXT: j .LBB61_803 +; CHECK-RV64-NEXT: .LBB61_293: # %else1134 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bgez a1, .LBB61_294 +; CHECK-RV64-NEXT: j .LBB61_804 +; CHECK-RV64-NEXT: .LBB61_294: # %else1138 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bgez a1, .LBB61_295 +; CHECK-RV64-NEXT: j .LBB61_805 +; CHECK-RV64-NEXT: .LBB61_295: # %else1142 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bgez a1, .LBB61_296 +; CHECK-RV64-NEXT: j .LBB61_806 +; CHECK-RV64-NEXT: .LBB61_296: # %else1146 
+; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bgez a1, .LBB61_297 +; CHECK-RV64-NEXT: j .LBB61_807 +; CHECK-RV64-NEXT: .LBB61_297: # %else1150 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bgez a1, .LBB61_298 +; CHECK-RV64-NEXT: j .LBB61_808 +; CHECK-RV64-NEXT: .LBB61_298: # %else1154 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bgez a1, .LBB61_299 +; CHECK-RV64-NEXT: j .LBB61_809 +; CHECK-RV64-NEXT: .LBB61_299: # %else1158 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bgez a1, .LBB61_300 +; CHECK-RV64-NEXT: j .LBB61_810 +; CHECK-RV64-NEXT: .LBB61_300: # %else1162 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bgez a1, .LBB61_301 +; CHECK-RV64-NEXT: j .LBB61_811 +; CHECK-RV64-NEXT: .LBB61_301: # %else1166 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bgez a1, .LBB61_302 +; CHECK-RV64-NEXT: j .LBB61_812 +; CHECK-RV64-NEXT: .LBB61_302: # %else1170 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bgez a1, .LBB61_303 +; CHECK-RV64-NEXT: j .LBB61_813 +; CHECK-RV64-NEXT: .LBB61_303: # %else1174 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bgez a1, .LBB61_304 +; CHECK-RV64-NEXT: j .LBB61_814 +; CHECK-RV64-NEXT: .LBB61_304: # %else1178 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bgez a1, .LBB61_305 +; CHECK-RV64-NEXT: j .LBB61_815 +; CHECK-RV64-NEXT: .LBB61_305: # %else1182 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bgez a1, .LBB61_306 +; CHECK-RV64-NEXT: j .LBB61_816 +; CHECK-RV64-NEXT: .LBB61_306: # %else1186 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bgez a1, .LBB61_307 +; CHECK-RV64-NEXT: j .LBB61_817 +; CHECK-RV64-NEXT: .LBB61_307: # %else1190 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bgez a1, .LBB61_308 +; CHECK-RV64-NEXT: j .LBB61_818 +; CHECK-RV64-NEXT: .LBB61_308: # %else1194 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bgez a1, .LBB61_309 +; CHECK-RV64-NEXT: j .LBB61_819 +; CHECK-RV64-NEXT: .LBB61_309: # %else1198 +; 
CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bgez a1, .LBB61_310 +; CHECK-RV64-NEXT: j .LBB61_820 +; CHECK-RV64-NEXT: .LBB61_310: # %else1202 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bgez a1, .LBB61_311 +; CHECK-RV64-NEXT: j .LBB61_821 +; CHECK-RV64-NEXT: .LBB61_311: # %else1206 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_312 +; CHECK-RV64-NEXT: j .LBB61_822 +; CHECK-RV64-NEXT: .LBB61_312: # %else1210 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bgez a1, .LBB61_313 +; CHECK-RV64-NEXT: j .LBB61_823 +; CHECK-RV64-NEXT: .LBB61_313: # %else1214 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bgez a1, .LBB61_314 +; CHECK-RV64-NEXT: j .LBB61_824 +; CHECK-RV64-NEXT: .LBB61_314: # %else1218 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bgez a1, .LBB61_315 +; CHECK-RV64-NEXT: j .LBB61_825 +; CHECK-RV64-NEXT: .LBB61_315: # %else1222 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bgez a1, .LBB61_316 +; CHECK-RV64-NEXT: j .LBB61_826 +; CHECK-RV64-NEXT: .LBB61_316: # %else1226 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bgez a1, .LBB61_317 +; CHECK-RV64-NEXT: j .LBB61_827 +; CHECK-RV64-NEXT: .LBB61_317: # %else1230 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bgez a1, .LBB61_318 +; CHECK-RV64-NEXT: j .LBB61_828 +; CHECK-RV64-NEXT: .LBB61_318: # %else1234 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bgez a1, .LBB61_319 +; CHECK-RV64-NEXT: j .LBB61_829 +; CHECK-RV64-NEXT: .LBB61_319: # %else1238 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bgez a1, .LBB61_320 +; CHECK-RV64-NEXT: j .LBB61_830 +; CHECK-RV64-NEXT: .LBB61_320: # %else1242 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bgez a1, .LBB61_321 +; CHECK-RV64-NEXT: j .LBB61_831 +; CHECK-RV64-NEXT: .LBB61_321: # %else1246 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bgez a1, .LBB61_322 +; CHECK-RV64-NEXT: j .LBB61_832 +; CHECK-RV64-NEXT: .LBB61_322: # %else1250 +; 
CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bgez a1, .LBB61_323 +; CHECK-RV64-NEXT: j .LBB61_833 +; CHECK-RV64-NEXT: .LBB61_323: # %else1254 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bgez a1, .LBB61_324 +; CHECK-RV64-NEXT: j .LBB61_834 +; CHECK-RV64-NEXT: .LBB61_324: # %else1258 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bgez a1, .LBB61_325 +; CHECK-RV64-NEXT: j .LBB61_835 +; CHECK-RV64-NEXT: .LBB61_325: # %else1262 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_327 +; CHECK-RV64-NEXT: .LBB61_326: # %cond.load1265 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 318 +; CHECK-RV64-NEXT: li a3, 317 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_327: # %else1266 +; CHECK-RV64-NEXT: slli a1, a2, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 5 +; CHECK-RV64-NEXT: bgez a1, .LBB61_329 +; CHECK-RV64-NEXT: # %bb.328: # %cond.load1269 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: li a1, 319 +; CHECK-RV64-NEXT: li a3, 318 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_329: # %else1270 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_330 +; CHECK-RV64-NEXT: j .LBB61_836 +; CHECK-RV64-NEXT: .LBB61_330: # %else1274 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: beqz a2, .LBB61_331 +; CHECK-RV64-NEXT: j .LBB61_837 +; CHECK-RV64-NEXT: .LBB61_331: # %else1278 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: beqz a2, .LBB61_332 +; CHECK-RV64-NEXT: j .LBB61_838 +; CHECK-RV64-NEXT: .LBB61_332: # 
%else1282 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: beqz a2, .LBB61_333 +; CHECK-RV64-NEXT: j .LBB61_839 +; CHECK-RV64-NEXT: .LBB61_333: # %else1286 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: beqz a2, .LBB61_334 +; CHECK-RV64-NEXT: j .LBB61_840 +; CHECK-RV64-NEXT: .LBB61_334: # %else1290 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: beqz a2, .LBB61_335 +; CHECK-RV64-NEXT: j .LBB61_841 +; CHECK-RV64-NEXT: .LBB61_335: # %else1294 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: beqz a2, .LBB61_336 +; CHECK-RV64-NEXT: j .LBB61_842 +; CHECK-RV64-NEXT: .LBB61_336: # %else1298 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: beqz a2, .LBB61_337 +; CHECK-RV64-NEXT: j .LBB61_843 +; CHECK-RV64-NEXT: .LBB61_337: # %else1302 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: beqz a2, .LBB61_338 +; CHECK-RV64-NEXT: j .LBB61_844 +; CHECK-RV64-NEXT: .LBB61_338: # %else1306 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: beqz a2, .LBB61_339 +; CHECK-RV64-NEXT: j .LBB61_845 +; CHECK-RV64-NEXT: .LBB61_339: # %else1310 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: beqz a2, .LBB61_340 +; CHECK-RV64-NEXT: j .LBB61_846 +; CHECK-RV64-NEXT: .LBB61_340: # %else1314 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: beqz a2, .LBB61_341 +; CHECK-RV64-NEXT: j .LBB61_847 +; CHECK-RV64-NEXT: .LBB61_341: # %else1318 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bgez a2, .LBB61_342 +; CHECK-RV64-NEXT: j .LBB61_848 +; CHECK-RV64-NEXT: .LBB61_342: # %else1322 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bgez a2, .LBB61_343 +; CHECK-RV64-NEXT: j .LBB61_849 +; CHECK-RV64-NEXT: .LBB61_343: # %else1326 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bgez a2, .LBB61_344 +; CHECK-RV64-NEXT: j .LBB61_850 +; CHECK-RV64-NEXT: .LBB61_344: # %else1330 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bgez a2, .LBB61_345 +; CHECK-RV64-NEXT: j .LBB61_851 +; CHECK-RV64-NEXT: .LBB61_345: # 
%else1334 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bgez a2, .LBB61_346 +; CHECK-RV64-NEXT: j .LBB61_852 +; CHECK-RV64-NEXT: .LBB61_346: # %else1338 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bgez a2, .LBB61_347 +; CHECK-RV64-NEXT: j .LBB61_853 +; CHECK-RV64-NEXT: .LBB61_347: # %else1342 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bgez a2, .LBB61_348 +; CHECK-RV64-NEXT: j .LBB61_854 +; CHECK-RV64-NEXT: .LBB61_348: # %else1346 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bgez a2, .LBB61_349 +; CHECK-RV64-NEXT: j .LBB61_855 +; CHECK-RV64-NEXT: .LBB61_349: # %else1350 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bgez a2, .LBB61_350 +; CHECK-RV64-NEXT: j .LBB61_856 +; CHECK-RV64-NEXT: .LBB61_350: # %else1354 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bgez a2, .LBB61_351 +; CHECK-RV64-NEXT: j .LBB61_857 +; CHECK-RV64-NEXT: .LBB61_351: # %else1358 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bgez a2, .LBB61_352 +; CHECK-RV64-NEXT: j .LBB61_858 +; CHECK-RV64-NEXT: .LBB61_352: # %else1362 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bgez a2, .LBB61_353 +; CHECK-RV64-NEXT: j .LBB61_859 +; CHECK-RV64-NEXT: .LBB61_353: # %else1366 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bgez a2, .LBB61_354 +; CHECK-RV64-NEXT: j .LBB61_860 +; CHECK-RV64-NEXT: .LBB61_354: # %else1370 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bgez a2, .LBB61_355 +; CHECK-RV64-NEXT: j .LBB61_861 +; CHECK-RV64-NEXT: .LBB61_355: # %else1374 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bgez a2, .LBB61_356 +; CHECK-RV64-NEXT: j .LBB61_862 +; CHECK-RV64-NEXT: .LBB61_356: # %else1378 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bgez a2, .LBB61_357 +; CHECK-RV64-NEXT: j .LBB61_863 +; CHECK-RV64-NEXT: .LBB61_357: # %else1382 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bgez a2, .LBB61_358 +; CHECK-RV64-NEXT: j .LBB61_864 +; CHECK-RV64-NEXT: .LBB61_358: # %else1386 
+; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bgez a2, .LBB61_359 +; CHECK-RV64-NEXT: j .LBB61_865 +; CHECK-RV64-NEXT: .LBB61_359: # %else1390 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bgez a2, .LBB61_360 +; CHECK-RV64-NEXT: j .LBB61_866 +; CHECK-RV64-NEXT: .LBB61_360: # %else1394 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bgez a2, .LBB61_361 +; CHECK-RV64-NEXT: j .LBB61_867 +; CHECK-RV64-NEXT: .LBB61_361: # %else1398 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bgez a2, .LBB61_362 +; CHECK-RV64-NEXT: j .LBB61_868 +; CHECK-RV64-NEXT: .LBB61_362: # %else1402 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bgez a2, .LBB61_363 +; CHECK-RV64-NEXT: j .LBB61_869 +; CHECK-RV64-NEXT: .LBB61_363: # %else1406 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bgez a2, .LBB61_364 +; CHECK-RV64-NEXT: j .LBB61_870 +; CHECK-RV64-NEXT: .LBB61_364: # %else1410 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bgez a2, .LBB61_365 +; CHECK-RV64-NEXT: j .LBB61_871 +; CHECK-RV64-NEXT: .LBB61_365: # %else1414 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bgez a2, .LBB61_366 +; CHECK-RV64-NEXT: j .LBB61_872 +; CHECK-RV64-NEXT: .LBB61_366: # %else1418 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bgez a2, .LBB61_367 +; CHECK-RV64-NEXT: j .LBB61_873 +; CHECK-RV64-NEXT: .LBB61_367: # %else1422 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bgez a2, .LBB61_368 +; CHECK-RV64-NEXT: j .LBB61_874 +; CHECK-RV64-NEXT: .LBB61_368: # %else1426 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bgez a2, .LBB61_369 +; CHECK-RV64-NEXT: j .LBB61_875 +; CHECK-RV64-NEXT: .LBB61_369: # %else1430 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bgez a2, .LBB61_370 +; CHECK-RV64-NEXT: j .LBB61_876 +; CHECK-RV64-NEXT: .LBB61_370: # %else1434 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bgez a2, .LBB61_371 +; CHECK-RV64-NEXT: j .LBB61_877 +; CHECK-RV64-NEXT: .LBB61_371: # %else1438 +; 
CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bgez a2, .LBB61_372 +; CHECK-RV64-NEXT: j .LBB61_878 +; CHECK-RV64-NEXT: .LBB61_372: # %else1442 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bgez a2, .LBB61_373 +; CHECK-RV64-NEXT: j .LBB61_879 +; CHECK-RV64-NEXT: .LBB61_373: # %else1446 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bgez a2, .LBB61_374 +; CHECK-RV64-NEXT: j .LBB61_880 +; CHECK-RV64-NEXT: .LBB61_374: # %else1450 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bgez a2, .LBB61_375 +; CHECK-RV64-NEXT: j .LBB61_881 +; CHECK-RV64-NEXT: .LBB61_375: # %else1454 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bgez a2, .LBB61_376 +; CHECK-RV64-NEXT: j .LBB61_882 +; CHECK-RV64-NEXT: .LBB61_376: # %else1458 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bgez a2, .LBB61_377 +; CHECK-RV64-NEXT: j .LBB61_883 +; CHECK-RV64-NEXT: .LBB61_377: # %else1462 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_378 +; CHECK-RV64-NEXT: j .LBB61_884 +; CHECK-RV64-NEXT: .LBB61_378: # %else1466 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bgez a2, .LBB61_379 +; CHECK-RV64-NEXT: j .LBB61_885 +; CHECK-RV64-NEXT: .LBB61_379: # %else1470 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bgez a2, .LBB61_380 +; CHECK-RV64-NEXT: j .LBB61_886 +; CHECK-RV64-NEXT: .LBB61_380: # %else1474 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bgez a2, .LBB61_381 +; CHECK-RV64-NEXT: j .LBB61_887 +; CHECK-RV64-NEXT: .LBB61_381: # %else1478 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bgez a2, .LBB61_382 +; CHECK-RV64-NEXT: j .LBB61_888 +; CHECK-RV64-NEXT: .LBB61_382: # %else1482 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bgez a2, .LBB61_383 +; CHECK-RV64-NEXT: j .LBB61_889 +; CHECK-RV64-NEXT: .LBB61_383: # %else1486 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bgez a2, .LBB61_384 +; CHECK-RV64-NEXT: j .LBB61_890 +; CHECK-RV64-NEXT: .LBB61_384: # %else1490 +; 
CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bgez a2, .LBB61_385 +; CHECK-RV64-NEXT: j .LBB61_891 +; CHECK-RV64-NEXT: .LBB61_385: # %else1494 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bgez a2, .LBB61_386 +; CHECK-RV64-NEXT: j .LBB61_892 +; CHECK-RV64-NEXT: .LBB61_386: # %else1498 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bgez a2, .LBB61_387 +; CHECK-RV64-NEXT: j .LBB61_893 +; CHECK-RV64-NEXT: .LBB61_387: # %else1502 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bgez a2, .LBB61_388 +; CHECK-RV64-NEXT: j .LBB61_894 +; CHECK-RV64-NEXT: .LBB61_388: # %else1506 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bgez a2, .LBB61_389 +; CHECK-RV64-NEXT: j .LBB61_895 +; CHECK-RV64-NEXT: .LBB61_389: # %else1510 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bgez a2, .LBB61_390 +; CHECK-RV64-NEXT: j .LBB61_896 +; CHECK-RV64-NEXT: .LBB61_390: # %else1514 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bgez a2, .LBB61_391 +; CHECK-RV64-NEXT: j .LBB61_897 +; CHECK-RV64-NEXT: .LBB61_391: # %else1518 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_393 +; CHECK-RV64-NEXT: .LBB61_392: # %cond.load1521 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 382 +; CHECK-RV64-NEXT: li a3, 381 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_393: # %else1522 +; CHECK-RV64-NEXT: slli a2, a1, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 6 +; CHECK-RV64-NEXT: bgez a2, .LBB61_395 +; CHECK-RV64-NEXT: # %bb.394: # %cond.load1525 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: li a2, 383 +; CHECK-RV64-NEXT: li a3, 382 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_395: # %else1526 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a2, v16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_396 +; CHECK-RV64-NEXT: j .LBB61_898 +; CHECK-RV64-NEXT: .LBB61_396: # %else1530 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: beqz a1, .LBB61_397 +; CHECK-RV64-NEXT: j .LBB61_899 +; CHECK-RV64-NEXT: .LBB61_397: # %else1534 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: beqz a1, .LBB61_398 +; CHECK-RV64-NEXT: j .LBB61_900 +; CHECK-RV64-NEXT: .LBB61_398: # %else1538 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: beqz a1, .LBB61_399 +; CHECK-RV64-NEXT: j .LBB61_901 +; CHECK-RV64-NEXT: .LBB61_399: # %else1542 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: beqz a1, .LBB61_400 +; CHECK-RV64-NEXT: j .LBB61_902 +; CHECK-RV64-NEXT: .LBB61_400: # %else1546 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: beqz a1, .LBB61_401 +; CHECK-RV64-NEXT: j .LBB61_903 +; CHECK-RV64-NEXT: .LBB61_401: # %else1550 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: beqz a1, .LBB61_402 +; CHECK-RV64-NEXT: j .LBB61_904 +; CHECK-RV64-NEXT: .LBB61_402: # %else1554 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: beqz a1, .LBB61_403 +; CHECK-RV64-NEXT: j .LBB61_905 +; CHECK-RV64-NEXT: .LBB61_403: # %else1558 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: beqz a1, .LBB61_404 +; CHECK-RV64-NEXT: j .LBB61_906 +; CHECK-RV64-NEXT: .LBB61_404: # %else1562 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: beqz a1, .LBB61_405 +; CHECK-RV64-NEXT: j .LBB61_907 +; CHECK-RV64-NEXT: .LBB61_405: # %else1566 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: beqz a1, .LBB61_406 +; CHECK-RV64-NEXT: j .LBB61_908 +; CHECK-RV64-NEXT: .LBB61_406: # %else1570 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: beqz a1, .LBB61_407 +; CHECK-RV64-NEXT: j .LBB61_909 +; CHECK-RV64-NEXT: .LBB61_407: 
# %else1574 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bgez a1, .LBB61_408 +; CHECK-RV64-NEXT: j .LBB61_910 +; CHECK-RV64-NEXT: .LBB61_408: # %else1578 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bgez a1, .LBB61_409 +; CHECK-RV64-NEXT: j .LBB61_911 +; CHECK-RV64-NEXT: .LBB61_409: # %else1582 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bgez a1, .LBB61_410 +; CHECK-RV64-NEXT: j .LBB61_912 +; CHECK-RV64-NEXT: .LBB61_410: # %else1586 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bgez a1, .LBB61_411 +; CHECK-RV64-NEXT: j .LBB61_913 +; CHECK-RV64-NEXT: .LBB61_411: # %else1590 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bgez a1, .LBB61_412 +; CHECK-RV64-NEXT: j .LBB61_914 +; CHECK-RV64-NEXT: .LBB61_412: # %else1594 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bgez a1, .LBB61_413 +; CHECK-RV64-NEXT: j .LBB61_915 +; CHECK-RV64-NEXT: .LBB61_413: # %else1598 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bgez a1, .LBB61_414 +; CHECK-RV64-NEXT: j .LBB61_916 +; CHECK-RV64-NEXT: .LBB61_414: # %else1602 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bgez a1, .LBB61_415 +; CHECK-RV64-NEXT: j .LBB61_917 +; CHECK-RV64-NEXT: .LBB61_415: # %else1606 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bgez a1, .LBB61_416 +; CHECK-RV64-NEXT: j .LBB61_918 +; CHECK-RV64-NEXT: .LBB61_416: # %else1610 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bgez a1, .LBB61_417 +; CHECK-RV64-NEXT: j .LBB61_919 +; CHECK-RV64-NEXT: .LBB61_417: # %else1614 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bgez a1, .LBB61_418 +; CHECK-RV64-NEXT: j .LBB61_920 +; CHECK-RV64-NEXT: .LBB61_418: # %else1618 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bgez a1, .LBB61_419 +; CHECK-RV64-NEXT: j .LBB61_921 +; CHECK-RV64-NEXT: .LBB61_419: # %else1622 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bgez a1, .LBB61_420 +; CHECK-RV64-NEXT: j .LBB61_922 +; CHECK-RV64-NEXT: .LBB61_420: # 
%else1626 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bgez a1, .LBB61_421 +; CHECK-RV64-NEXT: j .LBB61_923 +; CHECK-RV64-NEXT: .LBB61_421: # %else1630 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bgez a1, .LBB61_422 +; CHECK-RV64-NEXT: j .LBB61_924 +; CHECK-RV64-NEXT: .LBB61_422: # %else1634 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bgez a1, .LBB61_423 +; CHECK-RV64-NEXT: j .LBB61_925 +; CHECK-RV64-NEXT: .LBB61_423: # %else1638 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bgez a1, .LBB61_424 +; CHECK-RV64-NEXT: j .LBB61_926 +; CHECK-RV64-NEXT: .LBB61_424: # %else1642 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bgez a1, .LBB61_425 +; CHECK-RV64-NEXT: j .LBB61_927 +; CHECK-RV64-NEXT: .LBB61_425: # %else1646 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bgez a1, .LBB61_426 +; CHECK-RV64-NEXT: j .LBB61_928 +; CHECK-RV64-NEXT: .LBB61_426: # %else1650 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bgez a1, .LBB61_427 +; CHECK-RV64-NEXT: j .LBB61_929 +; CHECK-RV64-NEXT: .LBB61_427: # %else1654 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bgez a1, .LBB61_428 +; CHECK-RV64-NEXT: j .LBB61_930 +; CHECK-RV64-NEXT: .LBB61_428: # %else1658 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bgez a1, .LBB61_429 +; CHECK-RV64-NEXT: j .LBB61_931 +; CHECK-RV64-NEXT: .LBB61_429: # %else1662 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bgez a1, .LBB61_430 +; CHECK-RV64-NEXT: j .LBB61_932 +; CHECK-RV64-NEXT: .LBB61_430: # %else1666 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bgez a1, .LBB61_431 +; CHECK-RV64-NEXT: j .LBB61_933 +; CHECK-RV64-NEXT: .LBB61_431: # %else1670 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bgez a1, .LBB61_432 +; CHECK-RV64-NEXT: j .LBB61_934 +; CHECK-RV64-NEXT: .LBB61_432: # %else1674 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bgez a1, .LBB61_433 +; CHECK-RV64-NEXT: j .LBB61_935 +; CHECK-RV64-NEXT: .LBB61_433: # %else1678 
+; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bgez a1, .LBB61_434 +; CHECK-RV64-NEXT: j .LBB61_936 +; CHECK-RV64-NEXT: .LBB61_434: # %else1682 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bgez a1, .LBB61_435 +; CHECK-RV64-NEXT: j .LBB61_937 +; CHECK-RV64-NEXT: .LBB61_435: # %else1686 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bgez a1, .LBB61_436 +; CHECK-RV64-NEXT: j .LBB61_938 +; CHECK-RV64-NEXT: .LBB61_436: # %else1690 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bgez a1, .LBB61_437 +; CHECK-RV64-NEXT: j .LBB61_939 +; CHECK-RV64-NEXT: .LBB61_437: # %else1694 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bgez a1, .LBB61_438 +; CHECK-RV64-NEXT: j .LBB61_940 +; CHECK-RV64-NEXT: .LBB61_438: # %else1698 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bgez a1, .LBB61_439 +; CHECK-RV64-NEXT: j .LBB61_941 +; CHECK-RV64-NEXT: .LBB61_439: # %else1702 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bgez a1, .LBB61_440 +; CHECK-RV64-NEXT: j .LBB61_942 +; CHECK-RV64-NEXT: .LBB61_440: # %else1706 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bgez a1, .LBB61_441 +; CHECK-RV64-NEXT: j .LBB61_943 +; CHECK-RV64-NEXT: .LBB61_441: # %else1710 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bgez a1, .LBB61_442 +; CHECK-RV64-NEXT: j .LBB61_944 +; CHECK-RV64-NEXT: .LBB61_442: # %else1714 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bgez a1, .LBB61_443 +; CHECK-RV64-NEXT: j .LBB61_945 +; CHECK-RV64-NEXT: .LBB61_443: # %else1718 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bgez a1, .LBB61_444 +; CHECK-RV64-NEXT: j .LBB61_946 +; CHECK-RV64-NEXT: .LBB61_444: # %else1722 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bgez a1, .LBB61_445 +; CHECK-RV64-NEXT: j .LBB61_947 +; CHECK-RV64-NEXT: .LBB61_445: # %else1726 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bgez a1, .LBB61_446 +; CHECK-RV64-NEXT: j .LBB61_948 +; CHECK-RV64-NEXT: .LBB61_446: # %else1730 +; 
CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bgez a1, .LBB61_447 +; CHECK-RV64-NEXT: j .LBB61_949 +; CHECK-RV64-NEXT: .LBB61_447: # %else1734 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bgez a1, .LBB61_448 +; CHECK-RV64-NEXT: j .LBB61_950 +; CHECK-RV64-NEXT: .LBB61_448: # %else1738 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bgez a1, .LBB61_449 +; CHECK-RV64-NEXT: j .LBB61_951 +; CHECK-RV64-NEXT: .LBB61_449: # %else1742 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bgez a1, .LBB61_450 +; CHECK-RV64-NEXT: j .LBB61_952 +; CHECK-RV64-NEXT: .LBB61_450: # %else1746 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bgez a1, .LBB61_451 +; CHECK-RV64-NEXT: j .LBB61_953 +; CHECK-RV64-NEXT: .LBB61_451: # %else1750 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bgez a1, .LBB61_452 +; CHECK-RV64-NEXT: j .LBB61_954 +; CHECK-RV64-NEXT: .LBB61_452: # %else1754 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bgez a1, .LBB61_453 +; CHECK-RV64-NEXT: j .LBB61_955 +; CHECK-RV64-NEXT: .LBB61_453: # %else1758 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bgez a1, .LBB61_454 +; CHECK-RV64-NEXT: j .LBB61_956 +; CHECK-RV64-NEXT: .LBB61_454: # %else1762 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bgez a1, .LBB61_455 +; CHECK-RV64-NEXT: j .LBB61_957 +; CHECK-RV64-NEXT: .LBB61_455: # %else1766 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bgez a1, .LBB61_456 +; CHECK-RV64-NEXT: j .LBB61_958 +; CHECK-RV64-NEXT: .LBB61_456: # %else1770 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bgez a1, .LBB61_457 +; CHECK-RV64-NEXT: j .LBB61_959 +; CHECK-RV64-NEXT: .LBB61_457: # %else1774 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_459 +; CHECK-RV64-NEXT: .LBB61_458: # %cond.load1777 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 446 +; 
CHECK-RV64-NEXT: li a3, 445 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_459: # %else1778 +; CHECK-RV64-NEXT: slli a1, a2, 1 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vslidedown.vi v16, v0, 7 +; CHECK-RV64-NEXT: bgez a1, .LBB61_461 +; CHECK-RV64-NEXT: # %bb.460: # %cond.load1781 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: li a1, 447 +; CHECK-RV64-NEXT: li a3, 446 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: .LBB61_461: # %else1782 +; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-RV64-NEXT: vmv.x.s a1, v16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_462 +; CHECK-RV64-NEXT: j .LBB61_960 +; CHECK-RV64-NEXT: .LBB61_462: # %else1786 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: beqz a2, .LBB61_463 +; CHECK-RV64-NEXT: j .LBB61_961 +; CHECK-RV64-NEXT: .LBB61_463: # %else1790 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: beqz a2, .LBB61_464 +; CHECK-RV64-NEXT: j .LBB61_962 +; CHECK-RV64-NEXT: .LBB61_464: # %else1794 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: beqz a2, .LBB61_465 +; CHECK-RV64-NEXT: j .LBB61_963 +; CHECK-RV64-NEXT: .LBB61_465: # %else1798 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: beqz a2, .LBB61_466 +; CHECK-RV64-NEXT: j .LBB61_964 +; CHECK-RV64-NEXT: .LBB61_466: # %else1802 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: beqz a2, .LBB61_467 +; CHECK-RV64-NEXT: j .LBB61_965 +; CHECK-RV64-NEXT: .LBB61_467: # %else1806 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: beqz a2, .LBB61_468 +; CHECK-RV64-NEXT: j .LBB61_966 +; CHECK-RV64-NEXT: .LBB61_468: # %else1810 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: beqz a2, .LBB61_469 +; CHECK-RV64-NEXT: j .LBB61_967 +; CHECK-RV64-NEXT: .LBB61_469: # 
%else1814 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: beqz a2, .LBB61_470 +; CHECK-RV64-NEXT: j .LBB61_968 +; CHECK-RV64-NEXT: .LBB61_470: # %else1818 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: beqz a2, .LBB61_471 +; CHECK-RV64-NEXT: j .LBB61_969 +; CHECK-RV64-NEXT: .LBB61_471: # %else1822 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: beqz a2, .LBB61_472 +; CHECK-RV64-NEXT: j .LBB61_970 +; CHECK-RV64-NEXT: .LBB61_472: # %else1826 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: beqz a2, .LBB61_473 +; CHECK-RV64-NEXT: j .LBB61_971 +; CHECK-RV64-NEXT: .LBB61_473: # %else1830 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bgez a2, .LBB61_474 +; CHECK-RV64-NEXT: j .LBB61_972 +; CHECK-RV64-NEXT: .LBB61_474: # %else1834 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bgez a2, .LBB61_475 +; CHECK-RV64-NEXT: j .LBB61_973 +; CHECK-RV64-NEXT: .LBB61_475: # %else1838 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bgez a2, .LBB61_476 +; CHECK-RV64-NEXT: j .LBB61_974 +; CHECK-RV64-NEXT: .LBB61_476: # %else1842 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bgez a2, .LBB61_477 +; CHECK-RV64-NEXT: j .LBB61_975 +; CHECK-RV64-NEXT: .LBB61_477: # %else1846 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bgez a2, .LBB61_478 +; CHECK-RV64-NEXT: j .LBB61_976 +; CHECK-RV64-NEXT: .LBB61_478: # %else1850 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bgez a2, .LBB61_479 +; CHECK-RV64-NEXT: j .LBB61_977 +; CHECK-RV64-NEXT: .LBB61_479: # %else1854 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bgez a2, .LBB61_480 +; CHECK-RV64-NEXT: j .LBB61_978 +; CHECK-RV64-NEXT: .LBB61_480: # %else1858 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bgez a2, .LBB61_481 +; CHECK-RV64-NEXT: j .LBB61_979 +; CHECK-RV64-NEXT: .LBB61_481: # %else1862 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bgez a2, .LBB61_482 +; CHECK-RV64-NEXT: j .LBB61_980 +; CHECK-RV64-NEXT: .LBB61_482: # 
%else1866 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bgez a2, .LBB61_483 +; CHECK-RV64-NEXT: j .LBB61_981 +; CHECK-RV64-NEXT: .LBB61_483: # %else1870 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bgez a2, .LBB61_484 +; CHECK-RV64-NEXT: j .LBB61_982 +; CHECK-RV64-NEXT: .LBB61_484: # %else1874 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bgez a2, .LBB61_485 +; CHECK-RV64-NEXT: j .LBB61_983 +; CHECK-RV64-NEXT: .LBB61_485: # %else1878 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bgez a2, .LBB61_486 +; CHECK-RV64-NEXT: j .LBB61_984 +; CHECK-RV64-NEXT: .LBB61_486: # %else1882 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bgez a2, .LBB61_487 +; CHECK-RV64-NEXT: j .LBB61_985 +; CHECK-RV64-NEXT: .LBB61_487: # %else1886 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bgez a2, .LBB61_488 +; CHECK-RV64-NEXT: j .LBB61_986 +; CHECK-RV64-NEXT: .LBB61_488: # %else1890 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bgez a2, .LBB61_489 +; CHECK-RV64-NEXT: j .LBB61_987 +; CHECK-RV64-NEXT: .LBB61_489: # %else1894 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bgez a2, .LBB61_490 +; CHECK-RV64-NEXT: j .LBB61_988 +; CHECK-RV64-NEXT: .LBB61_490: # %else1898 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bgez a2, .LBB61_491 +; CHECK-RV64-NEXT: j .LBB61_989 +; CHECK-RV64-NEXT: .LBB61_491: # %else1902 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bgez a2, .LBB61_492 +; CHECK-RV64-NEXT: j .LBB61_990 +; CHECK-RV64-NEXT: .LBB61_492: # %else1906 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bgez a2, .LBB61_493 +; CHECK-RV64-NEXT: j .LBB61_991 +; CHECK-RV64-NEXT: .LBB61_493: # %else1910 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bgez a2, .LBB61_494 +; CHECK-RV64-NEXT: j .LBB61_992 +; CHECK-RV64-NEXT: .LBB61_494: # %else1914 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bgez a2, .LBB61_495 +; CHECK-RV64-NEXT: j .LBB61_993 +; CHECK-RV64-NEXT: .LBB61_495: # %else1918 
+; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bgez a2, .LBB61_496 +; CHECK-RV64-NEXT: j .LBB61_994 +; CHECK-RV64-NEXT: .LBB61_496: # %else1922 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bgez a2, .LBB61_497 +; CHECK-RV64-NEXT: j .LBB61_995 +; CHECK-RV64-NEXT: .LBB61_497: # %else1926 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bgez a2, .LBB61_498 +; CHECK-RV64-NEXT: j .LBB61_996 +; CHECK-RV64-NEXT: .LBB61_498: # %else1930 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bgez a2, .LBB61_499 +; CHECK-RV64-NEXT: j .LBB61_997 +; CHECK-RV64-NEXT: .LBB61_499: # %else1934 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bgez a2, .LBB61_500 +; CHECK-RV64-NEXT: j .LBB61_998 +; CHECK-RV64-NEXT: .LBB61_500: # %else1938 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bgez a2, .LBB61_501 +; CHECK-RV64-NEXT: j .LBB61_999 +; CHECK-RV64-NEXT: .LBB61_501: # %else1942 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bgez a2, .LBB61_502 +; CHECK-RV64-NEXT: j .LBB61_1000 +; CHECK-RV64-NEXT: .LBB61_502: # %else1946 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bgez a2, .LBB61_503 +; CHECK-RV64-NEXT: j .LBB61_1001 +; CHECK-RV64-NEXT: .LBB61_503: # %else1950 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bgez a2, .LBB61_504 +; CHECK-RV64-NEXT: j .LBB61_1002 +; CHECK-RV64-NEXT: .LBB61_504: # %else1954 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bgez a2, .LBB61_505 +; CHECK-RV64-NEXT: j .LBB61_1003 +; CHECK-RV64-NEXT: .LBB61_505: # %else1958 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bgez a2, .LBB61_506 +; CHECK-RV64-NEXT: j .LBB61_1004 +; CHECK-RV64-NEXT: .LBB61_506: # %else1962 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bgez a2, .LBB61_507 +; CHECK-RV64-NEXT: j .LBB61_1005 +; CHECK-RV64-NEXT: .LBB61_507: # %else1966 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bgez a2, .LBB61_508 +; CHECK-RV64-NEXT: j .LBB61_1006 +; CHECK-RV64-NEXT: .LBB61_508: # %else1970 +; 
CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bgez a2, .LBB61_509 +; CHECK-RV64-NEXT: j .LBB61_1007 +; CHECK-RV64-NEXT: .LBB61_509: # %else1974 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bgez a2, .LBB61_510 +; CHECK-RV64-NEXT: j .LBB61_1008 +; CHECK-RV64-NEXT: .LBB61_510: # %else1978 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bgez a2, .LBB61_511 +; CHECK-RV64-NEXT: j .LBB61_1009 +; CHECK-RV64-NEXT: .LBB61_511: # %else1982 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bgez a2, .LBB61_512 +; CHECK-RV64-NEXT: j .LBB61_1010 +; CHECK-RV64-NEXT: .LBB61_512: # %else1986 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bgez a2, .LBB61_513 +; CHECK-RV64-NEXT: j .LBB61_1011 +; CHECK-RV64-NEXT: .LBB61_513: # %else1990 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bgez a2, .LBB61_514 +; CHECK-RV64-NEXT: j .LBB61_1012 +; CHECK-RV64-NEXT: .LBB61_514: # %else1994 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bgez a2, .LBB61_515 +; CHECK-RV64-NEXT: j .LBB61_1013 +; CHECK-RV64-NEXT: .LBB61_515: # %else1998 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bgez a2, .LBB61_516 +; CHECK-RV64-NEXT: j .LBB61_1014 +; CHECK-RV64-NEXT: .LBB61_516: # %else2002 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bgez a2, .LBB61_517 +; CHECK-RV64-NEXT: j .LBB61_1015 +; CHECK-RV64-NEXT: .LBB61_517: # %else2006 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bgez a2, .LBB61_518 +; CHECK-RV64-NEXT: j .LBB61_1016 +; CHECK-RV64-NEXT: .LBB61_518: # %else2010 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bgez a2, .LBB61_519 +; CHECK-RV64-NEXT: j .LBB61_1017 +; CHECK-RV64-NEXT: .LBB61_519: # %else2014 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bgez a2, .LBB61_520 +; CHECK-RV64-NEXT: j .LBB61_1018 +; CHECK-RV64-NEXT: .LBB61_520: # %else2018 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bgez a2, .LBB61_521 +; CHECK-RV64-NEXT: j .LBB61_1019 +; CHECK-RV64-NEXT: .LBB61_521: # %else2022 +; 
CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bgez a2, .LBB61_522 +; CHECK-RV64-NEXT: j .LBB61_1020 +; CHECK-RV64-NEXT: .LBB61_522: # %else2026 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bgez a2, .LBB61_523 +; CHECK-RV64-NEXT: j .LBB61_1021 +; CHECK-RV64-NEXT: .LBB61_523: # %else2030 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_524 +; CHECK-RV64-NEXT: j .LBB61_1022 +; CHECK-RV64-NEXT: .LBB61_524: # %else2034 +; CHECK-RV64-NEXT: slli a2, a1, 1 +; CHECK-RV64-NEXT: bgez a2, .LBB61_525 +; CHECK-RV64-NEXT: j .LBB61_1023 +; CHECK-RV64-NEXT: .LBB61_525: # %else2038 +; CHECK-RV64-NEXT: bgez a1, .LBB61_526 +; CHECK-RV64-NEXT: j .LBB61_1024 +; CHECK-RV64-NEXT: .LBB61_526: # %else2042 +; CHECK-RV64-NEXT: ret +; CHECK-RV64-NEXT: .LBB61_527: # %cond.load +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v8, a1 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: bnez a1, .LBB61_528 +; CHECK-RV64-NEXT: j .LBB61_2 +; CHECK-RV64-NEXT: .LBB61_528: # %cond.load1 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vsetivli zero, 2, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 1 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: bnez a1, .LBB61_529 +; CHECK-RV64-NEXT: j .LBB61_3 +; CHECK-RV64-NEXT: .LBB61_529: # %cond.load5 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 3, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 2 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v 
v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: bnez a1, .LBB61_530 +; CHECK-RV64-NEXT: j .LBB61_4 +; CHECK-RV64-NEXT: .LBB61_530: # %cond.load9 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: bnez a1, .LBB61_531 +; CHECK-RV64-NEXT: j .LBB61_5 +; CHECK-RV64-NEXT: .LBB61_531: # %cond.load13 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 5, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 4 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: bnez a1, .LBB61_532 +; CHECK-RV64-NEXT: j .LBB61_6 +; CHECK-RV64-NEXT: .LBB61_532: # %cond.load17 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 6, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 5 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: bnez a1, .LBB61_533 +; CHECK-RV64-NEXT: j .LBB61_7 +; CHECK-RV64-NEXT: .LBB61_533: # %cond.load21 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 7, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 6 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: bnez a1, .LBB61_534 +; CHECK-RV64-NEXT: j 
.LBB61_8 +; CHECK-RV64-NEXT: .LBB61_534: # %cond.load25 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 7 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: bnez a1, .LBB61_535 +; CHECK-RV64-NEXT: j .LBB61_9 +; CHECK-RV64-NEXT: .LBB61_535: # %cond.load29 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 9, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 8 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: bnez a1, .LBB61_536 +; CHECK-RV64-NEXT: j .LBB61_10 +; CHECK-RV64-NEXT: .LBB61_536: # %cond.load33 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 10, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 9 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: bnez a1, .LBB61_537 +; CHECK-RV64-NEXT: j .LBB61_11 +; CHECK-RV64-NEXT: .LBB61_537: # %cond.load37 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 11, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 10 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bltz a1, .LBB61_538 +; CHECK-RV64-NEXT: j .LBB61_12 +; CHECK-RV64-NEXT: .LBB61_538: # %cond.load41 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 12, e8, m1, 
tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 11 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bltz a1, .LBB61_539 +; CHECK-RV64-NEXT: j .LBB61_13 +; CHECK-RV64-NEXT: .LBB61_539: # %cond.load45 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 13, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 12 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bltz a1, .LBB61_540 +; CHECK-RV64-NEXT: j .LBB61_14 +; CHECK-RV64-NEXT: .LBB61_540: # %cond.load49 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 14, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 13 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bltz a1, .LBB61_541 +; CHECK-RV64-NEXT: j .LBB61_15 +; CHECK-RV64-NEXT: .LBB61_541: # %cond.load53 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 15, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 14 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bltz a1, .LBB61_542 +; CHECK-RV64-NEXT: j .LBB61_16 +; CHECK-RV64-NEXT: .LBB61_542: # %cond.load57 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 16, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 15 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bltz a1, .LBB61_543 +; CHECK-RV64-NEXT: j .LBB61_17 +; CHECK-RV64-NEXT: .LBB61_543: # %cond.load61 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 17, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 16 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bltz a1, .LBB61_544 +; CHECK-RV64-NEXT: j .LBB61_18 +; CHECK-RV64-NEXT: .LBB61_544: # %cond.load65 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 18, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 17 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bltz a1, .LBB61_545 +; CHECK-RV64-NEXT: j .LBB61_19 +; CHECK-RV64-NEXT: .LBB61_545: # %cond.load69 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 19, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 18 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bltz a1, .LBB61_546 +; CHECK-RV64-NEXT: j .LBB61_20 +; CHECK-RV64-NEXT: .LBB61_546: # %cond.load73 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 20, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 19 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, 
a2, 43 +; CHECK-RV64-NEXT: bltz a1, .LBB61_547 +; CHECK-RV64-NEXT: j .LBB61_21 +; CHECK-RV64-NEXT: .LBB61_547: # %cond.load77 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 21, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 20 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bltz a1, .LBB61_548 +; CHECK-RV64-NEXT: j .LBB61_22 +; CHECK-RV64-NEXT: .LBB61_548: # %cond.load81 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 22, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 21 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bltz a1, .LBB61_549 +; CHECK-RV64-NEXT: j .LBB61_23 +; CHECK-RV64-NEXT: .LBB61_549: # %cond.load85 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 23, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 22 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bltz a1, .LBB61_550 +; CHECK-RV64-NEXT: j .LBB61_24 +; CHECK-RV64-NEXT: .LBB61_550: # %cond.load89 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 24, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 23 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bltz a1, .LBB61_551 +; CHECK-RV64-NEXT: j .LBB61_25 +; CHECK-RV64-NEXT: .LBB61_551: # %cond.load93 +; 
CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 25, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 24 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bltz a1, .LBB61_552 +; CHECK-RV64-NEXT: j .LBB61_26 +; CHECK-RV64-NEXT: .LBB61_552: # %cond.load97 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 26, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 25 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bltz a1, .LBB61_553 +; CHECK-RV64-NEXT: j .LBB61_27 +; CHECK-RV64-NEXT: .LBB61_553: # %cond.load101 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 27, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 26 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bltz a1, .LBB61_554 +; CHECK-RV64-NEXT: j .LBB61_28 +; CHECK-RV64-NEXT: .LBB61_554: # %cond.load105 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 28, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 27 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bltz a1, .LBB61_555 +; CHECK-RV64-NEXT: j .LBB61_29 +; CHECK-RV64-NEXT: .LBB61_555: # %cond.load109 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 29, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 28 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bltz a1, .LBB61_556 +; CHECK-RV64-NEXT: j .LBB61_30 +; CHECK-RV64-NEXT: .LBB61_556: # %cond.load113 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 30, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 29 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bltz a1, .LBB61_557 +; CHECK-RV64-NEXT: j .LBB61_31 +; CHECK-RV64-NEXT: .LBB61_557: # %cond.load117 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vsetivli zero, 31, e8, m1, tu, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: vslideup.vi v8, v16, 30 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bltz a1, .LBB61_558 +; CHECK-RV64-NEXT: j .LBB61_32 +; CHECK-RV64-NEXT: .LBB61_558: # %cond.load121 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vi v8, v24, 31 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bltz a1, .LBB61_559 +; CHECK-RV64-NEXT: j .LBB61_33 +; CHECK-RV64-NEXT: .LBB61_559: # %cond.load125 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x 
v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 33 +; CHECK-RV64-NEXT: li a3, 32 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bltz a1, .LBB61_560 +; CHECK-RV64-NEXT: j .LBB61_34 +; CHECK-RV64-NEXT: .LBB61_560: # %cond.load129 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 34 +; CHECK-RV64-NEXT: li a3, 33 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bltz a1, .LBB61_561 +; CHECK-RV64-NEXT: j .LBB61_35 +; CHECK-RV64-NEXT: .LBB61_561: # %cond.load133 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 35 +; CHECK-RV64-NEXT: li a3, 34 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bltz a1, .LBB61_562 +; CHECK-RV64-NEXT: j .LBB61_36 +; CHECK-RV64-NEXT: .LBB61_562: # %cond.load137 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 36 +; CHECK-RV64-NEXT: li a3, 35 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bltz a1, .LBB61_563 +; CHECK-RV64-NEXT: j .LBB61_37 +; CHECK-RV64-NEXT: .LBB61_563: # %cond.load141 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 37 +; CHECK-RV64-NEXT: li a3, 36 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bltz a1, .LBB61_564 +; CHECK-RV64-NEXT: j .LBB61_38 +; CHECK-RV64-NEXT: .LBB61_564: # %cond.load145 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 38 +; CHECK-RV64-NEXT: li a3, 37 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bltz a1, .LBB61_565 +; CHECK-RV64-NEXT: j .LBB61_39 +; CHECK-RV64-NEXT: .LBB61_565: # %cond.load149 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 39 +; CHECK-RV64-NEXT: li a3, 38 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: 
slli a1, a2, 24 +; CHECK-RV64-NEXT: bltz a1, .LBB61_566 +; CHECK-RV64-NEXT: j .LBB61_40 +; CHECK-RV64-NEXT: .LBB61_566: # %cond.load153 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 40 +; CHECK-RV64-NEXT: li a3, 39 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bltz a1, .LBB61_567 +; CHECK-RV64-NEXT: j .LBB61_41 +; CHECK-RV64-NEXT: .LBB61_567: # %cond.load157 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 41 +; CHECK-RV64-NEXT: li a3, 40 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bltz a1, .LBB61_568 +; CHECK-RV64-NEXT: j .LBB61_42 +; CHECK-RV64-NEXT: .LBB61_568: # %cond.load161 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 42 +; CHECK-RV64-NEXT: li a3, 41 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bltz a1, .LBB61_569 +; CHECK-RV64-NEXT: j .LBB61_43 +; CHECK-RV64-NEXT: .LBB61_569: # %cond.load165 +; CHECK-RV64-NEXT: lbu a1, 0(a0) 
+; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 43 +; CHECK-RV64-NEXT: li a3, 42 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bltz a1, .LBB61_570 +; CHECK-RV64-NEXT: j .LBB61_44 +; CHECK-RV64-NEXT: .LBB61_570: # %cond.load169 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 44 +; CHECK-RV64-NEXT: li a3, 43 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bltz a1, .LBB61_571 +; CHECK-RV64-NEXT: j .LBB61_45 +; CHECK-RV64-NEXT: .LBB61_571: # %cond.load173 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 45 +; CHECK-RV64-NEXT: li a3, 44 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bltz a1, .LBB61_572 +; CHECK-RV64-NEXT: j .LBB61_46 +; CHECK-RV64-NEXT: .LBB61_572: # %cond.load177 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; 
CHECK-RV64-NEXT: li a1, 46 +; CHECK-RV64-NEXT: li a3, 45 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bltz a1, .LBB61_573 +; CHECK-RV64-NEXT: j .LBB61_47 +; CHECK-RV64-NEXT: .LBB61_573: # %cond.load181 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 47 +; CHECK-RV64-NEXT: li a3, 46 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bltz a1, .LBB61_574 +; CHECK-RV64-NEXT: j .LBB61_48 +; CHECK-RV64-NEXT: .LBB61_574: # %cond.load185 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 48 +; CHECK-RV64-NEXT: li a3, 47 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bltz a1, .LBB61_575 +; CHECK-RV64-NEXT: j .LBB61_49 +; CHECK-RV64-NEXT: .LBB61_575: # %cond.load189 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 49 +; CHECK-RV64-NEXT: li a3, 48 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bltz a1, .LBB61_576 +; CHECK-RV64-NEXT: j .LBB61_50 +; CHECK-RV64-NEXT: .LBB61_576: # %cond.load193 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 50 +; CHECK-RV64-NEXT: li a3, 49 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bltz a1, .LBB61_577 +; CHECK-RV64-NEXT: j .LBB61_51 +; CHECK-RV64-NEXT: .LBB61_577: # %cond.load197 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 51 +; CHECK-RV64-NEXT: li a3, 50 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bltz a1, .LBB61_578 +; CHECK-RV64-NEXT: j .LBB61_52 +; CHECK-RV64-NEXT: .LBB61_578: # %cond.load201 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 52 +; CHECK-RV64-NEXT: li a3, 51 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bltz a1, 
.LBB61_579 +; CHECK-RV64-NEXT: j .LBB61_53 +; CHECK-RV64-NEXT: .LBB61_579: # %cond.load205 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 53 +; CHECK-RV64-NEXT: li a3, 52 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bltz a1, .LBB61_580 +; CHECK-RV64-NEXT: j .LBB61_54 +; CHECK-RV64-NEXT: .LBB61_580: # %cond.load209 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 54 +; CHECK-RV64-NEXT: li a3, 53 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bltz a1, .LBB61_581 +; CHECK-RV64-NEXT: j .LBB61_55 +; CHECK-RV64-NEXT: .LBB61_581: # %cond.load213 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 55 +; CHECK-RV64-NEXT: li a3, 54 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bltz a1, .LBB61_582 +; CHECK-RV64-NEXT: j .LBB61_56 +; CHECK-RV64-NEXT: .LBB61_582: # %cond.load217 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; 
CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 56 +; CHECK-RV64-NEXT: li a3, 55 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bltz a1, .LBB61_583 +; CHECK-RV64-NEXT: j .LBB61_57 +; CHECK-RV64-NEXT: .LBB61_583: # %cond.load221 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 57 +; CHECK-RV64-NEXT: li a3, 56 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bltz a1, .LBB61_584 +; CHECK-RV64-NEXT: j .LBB61_58 +; CHECK-RV64-NEXT: .LBB61_584: # %cond.load225 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 58 +; CHECK-RV64-NEXT: li a3, 57 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bltz a1, .LBB61_585 +; CHECK-RV64-NEXT: j .LBB61_59 +; CHECK-RV64-NEXT: .LBB61_585: # %cond.load229 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 59 +; CHECK-RV64-NEXT: 
li a3, 58 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bltz a1, .LBB61_586 +; CHECK-RV64-NEXT: j .LBB61_60 +; CHECK-RV64-NEXT: .LBB61_586: # %cond.load233 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 60 +; CHECK-RV64-NEXT: li a3, 59 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bltz a1, .LBB61_587 +; CHECK-RV64-NEXT: j .LBB61_61 +; CHECK-RV64-NEXT: .LBB61_587: # %cond.load237 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 61 +; CHECK-RV64-NEXT: li a3, 60 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_1025 +; CHECK-RV64-NEXT: j .LBB61_62 +; CHECK-RV64-NEXT: .LBB61_1025: # %cond.load237 +; CHECK-RV64-NEXT: j .LBB61_63 +; CHECK-RV64-NEXT: .LBB61_588: # %cond.load249 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 64 +; CHECK-RV64-NEXT: li a3, 63 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv1r.v v16, 
v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: bnez a2, .LBB61_589 +; CHECK-RV64-NEXT: j .LBB61_67 +; CHECK-RV64-NEXT: .LBB61_589: # %cond.load253 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 65 +; CHECK-RV64-NEXT: li a3, 64 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: bnez a2, .LBB61_590 +; CHECK-RV64-NEXT: j .LBB61_68 +; CHECK-RV64-NEXT: .LBB61_590: # %cond.load257 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 66 +; CHECK-RV64-NEXT: li a3, 65 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: bnez a2, .LBB61_591 +; CHECK-RV64-NEXT: j .LBB61_69 +; CHECK-RV64-NEXT: .LBB61_591: # %cond.load261 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 67 +; CHECK-RV64-NEXT: li a3, 66 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: bnez a2, .LBB61_592 +; CHECK-RV64-NEXT: j .LBB61_70 +; CHECK-RV64-NEXT: 
.LBB61_592: # %cond.load265 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 68 +; CHECK-RV64-NEXT: li a3, 67 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: bnez a2, .LBB61_593 +; CHECK-RV64-NEXT: j .LBB61_71 +; CHECK-RV64-NEXT: .LBB61_593: # %cond.load269 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 69 +; CHECK-RV64-NEXT: li a3, 68 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: bnez a2, .LBB61_594 +; CHECK-RV64-NEXT: j .LBB61_72 +; CHECK-RV64-NEXT: .LBB61_594: # %cond.load273 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 70 +; CHECK-RV64-NEXT: li a3, 69 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: bnez a2, .LBB61_595 +; CHECK-RV64-NEXT: j .LBB61_73 +; CHECK-RV64-NEXT: .LBB61_595: # %cond.load277 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: 
vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 71 +; CHECK-RV64-NEXT: li a3, 70 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: bnez a2, .LBB61_596 +; CHECK-RV64-NEXT: j .LBB61_74 +; CHECK-RV64-NEXT: .LBB61_596: # %cond.load281 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 72 +; CHECK-RV64-NEXT: li a3, 71 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: bnez a2, .LBB61_597 +; CHECK-RV64-NEXT: j .LBB61_75 +; CHECK-RV64-NEXT: .LBB61_597: # %cond.load285 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 73 +; CHECK-RV64-NEXT: li a3, 72 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: bnez a2, .LBB61_598 +; CHECK-RV64-NEXT: j .LBB61_76 +; CHECK-RV64-NEXT: .LBB61_598: # %cond.load289 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 74 +; CHECK-RV64-NEXT: li a3, 73 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma 
+; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: bnez a2, .LBB61_599 +; CHECK-RV64-NEXT: j .LBB61_77 +; CHECK-RV64-NEXT: .LBB61_599: # %cond.load293 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 75 +; CHECK-RV64-NEXT: li a3, 74 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bltz a2, .LBB61_600 +; CHECK-RV64-NEXT: j .LBB61_78 +; CHECK-RV64-NEXT: .LBB61_600: # %cond.load297 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 76 +; CHECK-RV64-NEXT: li a3, 75 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bltz a2, .LBB61_601 +; CHECK-RV64-NEXT: j .LBB61_79 +; CHECK-RV64-NEXT: .LBB61_601: # %cond.load301 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 77 +; CHECK-RV64-NEXT: li a3, 76 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; 
CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bltz a2, .LBB61_602 +; CHECK-RV64-NEXT: j .LBB61_80 +; CHECK-RV64-NEXT: .LBB61_602: # %cond.load305 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 78 +; CHECK-RV64-NEXT: li a3, 77 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bltz a2, .LBB61_603 +; CHECK-RV64-NEXT: j .LBB61_81 +; CHECK-RV64-NEXT: .LBB61_603: # %cond.load309 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 79 +; CHECK-RV64-NEXT: li a3, 78 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bltz a2, .LBB61_604 +; CHECK-RV64-NEXT: j .LBB61_82 +; CHECK-RV64-NEXT: .LBB61_604: # %cond.load313 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 80 +; CHECK-RV64-NEXT: li a3, 79 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bltz a2, .LBB61_605 +; CHECK-RV64-NEXT: j .LBB61_83 +; CHECK-RV64-NEXT: .LBB61_605: # %cond.load317 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 81 +; CHECK-RV64-NEXT: li a3, 80 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bltz a2, .LBB61_606 +; CHECK-RV64-NEXT: j .LBB61_84 +; CHECK-RV64-NEXT: .LBB61_606: # %cond.load321 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 82 +; CHECK-RV64-NEXT: li a3, 81 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bltz a2, .LBB61_607 +; CHECK-RV64-NEXT: j .LBB61_85 +; CHECK-RV64-NEXT: .LBB61_607: # %cond.load325 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 83 +; CHECK-RV64-NEXT: li a3, 82 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bltz a2, .LBB61_608 +; CHECK-RV64-NEXT: j .LBB61_86 +; CHECK-RV64-NEXT: .LBB61_608: # %cond.load329 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; 
CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 84 +; CHECK-RV64-NEXT: li a3, 83 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bltz a2, .LBB61_609 +; CHECK-RV64-NEXT: j .LBB61_87 +; CHECK-RV64-NEXT: .LBB61_609: # %cond.load333 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 85 +; CHECK-RV64-NEXT: li a3, 84 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bltz a2, .LBB61_610 +; CHECK-RV64-NEXT: j .LBB61_88 +; CHECK-RV64-NEXT: .LBB61_610: # %cond.load337 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 86 +; CHECK-RV64-NEXT: li a3, 85 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bltz a2, .LBB61_611 +; CHECK-RV64-NEXT: j .LBB61_89 +; CHECK-RV64-NEXT: .LBB61_611: # %cond.load341 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 87 +; CHECK-RV64-NEXT: li a3, 86 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: 
vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bltz a2, .LBB61_612 +; CHECK-RV64-NEXT: j .LBB61_90 +; CHECK-RV64-NEXT: .LBB61_612: # %cond.load345 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 88 +; CHECK-RV64-NEXT: li a3, 87 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bltz a2, .LBB61_613 +; CHECK-RV64-NEXT: j .LBB61_91 +; CHECK-RV64-NEXT: .LBB61_613: # %cond.load349 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 89 +; CHECK-RV64-NEXT: li a3, 88 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bltz a2, .LBB61_614 +; CHECK-RV64-NEXT: j .LBB61_92 +; CHECK-RV64-NEXT: .LBB61_614: # %cond.load353 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 90 +; CHECK-RV64-NEXT: li a3, 89 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; 
CHECK-RV64-NEXT: bltz a2, .LBB61_615 +; CHECK-RV64-NEXT: j .LBB61_93 +; CHECK-RV64-NEXT: .LBB61_615: # %cond.load357 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 91 +; CHECK-RV64-NEXT: li a3, 90 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bltz a2, .LBB61_616 +; CHECK-RV64-NEXT: j .LBB61_94 +; CHECK-RV64-NEXT: .LBB61_616: # %cond.load361 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 92 +; CHECK-RV64-NEXT: li a3, 91 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bltz a2, .LBB61_617 +; CHECK-RV64-NEXT: j .LBB61_95 +; CHECK-RV64-NEXT: .LBB61_617: # %cond.load365 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 93 +; CHECK-RV64-NEXT: li a3, 92 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bltz a2, .LBB61_618 +; CHECK-RV64-NEXT: j .LBB61_96 +; CHECK-RV64-NEXT: .LBB61_618: # %cond.load369 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; 
CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 94 +; CHECK-RV64-NEXT: li a3, 93 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bltz a2, .LBB61_619 +; CHECK-RV64-NEXT: j .LBB61_97 +; CHECK-RV64-NEXT: .LBB61_619: # %cond.load373 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 95 +; CHECK-RV64-NEXT: li a3, 94 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bltz a2, .LBB61_620 +; CHECK-RV64-NEXT: j .LBB61_98 +; CHECK-RV64-NEXT: .LBB61_620: # %cond.load377 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 96 +; CHECK-RV64-NEXT: li a3, 95 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bltz a2, .LBB61_621 +; CHECK-RV64-NEXT: j .LBB61_99 +; CHECK-RV64-NEXT: .LBB61_621: # %cond.load381 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; 
CHECK-RV64-NEXT: li a2, 97 +; CHECK-RV64-NEXT: li a3, 96 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bltz a2, .LBB61_622 +; CHECK-RV64-NEXT: j .LBB61_100 +; CHECK-RV64-NEXT: .LBB61_622: # %cond.load385 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 98 +; CHECK-RV64-NEXT: li a3, 97 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bltz a2, .LBB61_623 +; CHECK-RV64-NEXT: j .LBB61_101 +; CHECK-RV64-NEXT: .LBB61_623: # %cond.load389 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 99 +; CHECK-RV64-NEXT: li a3, 98 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bltz a2, .LBB61_624 +; CHECK-RV64-NEXT: j .LBB61_102 +; CHECK-RV64-NEXT: .LBB61_624: # %cond.load393 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 100 +; CHECK-RV64-NEXT: li a3, 99 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bltz a2, .LBB61_625 +; CHECK-RV64-NEXT: j .LBB61_103 +; CHECK-RV64-NEXT: .LBB61_625: # %cond.load397 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 101 +; CHECK-RV64-NEXT: li a3, 100 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bltz a2, .LBB61_626 +; CHECK-RV64-NEXT: j .LBB61_104 +; CHECK-RV64-NEXT: .LBB61_626: # %cond.load401 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 102 +; CHECK-RV64-NEXT: li a3, 101 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bltz a2, .LBB61_627 +; CHECK-RV64-NEXT: j .LBB61_105 +; CHECK-RV64-NEXT: .LBB61_627: # %cond.load405 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 103 +; CHECK-RV64-NEXT: li a3, 102 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: 
bltz a2, .LBB61_628 +; CHECK-RV64-NEXT: j .LBB61_106 +; CHECK-RV64-NEXT: .LBB61_628: # %cond.load409 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 104 +; CHECK-RV64-NEXT: li a3, 103 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bltz a2, .LBB61_629 +; CHECK-RV64-NEXT: j .LBB61_107 +; CHECK-RV64-NEXT: .LBB61_629: # %cond.load413 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 105 +; CHECK-RV64-NEXT: li a3, 104 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bltz a2, .LBB61_630 +; CHECK-RV64-NEXT: j .LBB61_108 +; CHECK-RV64-NEXT: .LBB61_630: # %cond.load417 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 106 +; CHECK-RV64-NEXT: li a3, 105 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bltz a2, .LBB61_631 +; CHECK-RV64-NEXT: j .LBB61_109 +; CHECK-RV64-NEXT: .LBB61_631: # %cond.load421 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li 
a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 107 +; CHECK-RV64-NEXT: li a3, 106 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bltz a2, .LBB61_632 +; CHECK-RV64-NEXT: j .LBB61_110 +; CHECK-RV64-NEXT: .LBB61_632: # %cond.load425 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 108 +; CHECK-RV64-NEXT: li a3, 107 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bltz a2, .LBB61_633 +; CHECK-RV64-NEXT: j .LBB61_111 +; CHECK-RV64-NEXT: .LBB61_633: # %cond.load429 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 109 +; CHECK-RV64-NEXT: li a3, 108 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bltz a2, .LBB61_634 +; CHECK-RV64-NEXT: j .LBB61_112 +; CHECK-RV64-NEXT: .LBB61_634: # %cond.load433 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 
110 +; CHECK-RV64-NEXT: li a3, 109 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bltz a2, .LBB61_635 +; CHECK-RV64-NEXT: j .LBB61_113 +; CHECK-RV64-NEXT: .LBB61_635: # %cond.load437 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 111 +; CHECK-RV64-NEXT: li a3, 110 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bltz a2, .LBB61_636 +; CHECK-RV64-NEXT: j .LBB61_114 +; CHECK-RV64-NEXT: .LBB61_636: # %cond.load441 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 112 +; CHECK-RV64-NEXT: li a3, 111 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bltz a2, .LBB61_637 +; CHECK-RV64-NEXT: j .LBB61_115 +; CHECK-RV64-NEXT: .LBB61_637: # %cond.load445 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 113 +; CHECK-RV64-NEXT: li a3, 112 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, 
a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bltz a2, .LBB61_638 +; CHECK-RV64-NEXT: j .LBB61_116 +; CHECK-RV64-NEXT: .LBB61_638: # %cond.load449 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 114 +; CHECK-RV64-NEXT: li a3, 113 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bltz a2, .LBB61_639 +; CHECK-RV64-NEXT: j .LBB61_117 +; CHECK-RV64-NEXT: .LBB61_639: # %cond.load453 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 115 +; CHECK-RV64-NEXT: li a3, 114 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bltz a2, .LBB61_640 +; CHECK-RV64-NEXT: j .LBB61_118 +; CHECK-RV64-NEXT: .LBB61_640: # %cond.load457 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 116 +; CHECK-RV64-NEXT: li a3, 115 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bltz a2, .LBB61_641 +; 
CHECK-RV64-NEXT: j .LBB61_119 +; CHECK-RV64-NEXT: .LBB61_641: # %cond.load461 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 117 +; CHECK-RV64-NEXT: li a3, 116 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bltz a2, .LBB61_642 +; CHECK-RV64-NEXT: j .LBB61_120 +; CHECK-RV64-NEXT: .LBB61_642: # %cond.load465 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 118 +; CHECK-RV64-NEXT: li a3, 117 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bltz a2, .LBB61_643 +; CHECK-RV64-NEXT: j .LBB61_121 +; CHECK-RV64-NEXT: .LBB61_643: # %cond.load469 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 119 +; CHECK-RV64-NEXT: li a3, 118 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bltz a2, .LBB61_644 +; CHECK-RV64-NEXT: j .LBB61_122 +; CHECK-RV64-NEXT: .LBB61_644: # %cond.load473 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: 
vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 120 +; CHECK-RV64-NEXT: li a3, 119 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bltz a2, .LBB61_645 +; CHECK-RV64-NEXT: j .LBB61_123 +; CHECK-RV64-NEXT: .LBB61_645: # %cond.load477 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 121 +; CHECK-RV64-NEXT: li a3, 120 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bltz a2, .LBB61_646 +; CHECK-RV64-NEXT: j .LBB61_124 +; CHECK-RV64-NEXT: .LBB61_646: # %cond.load481 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 122 +; CHECK-RV64-NEXT: li a3, 121 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bltz a2, .LBB61_647 +; CHECK-RV64-NEXT: j .LBB61_125 +; CHECK-RV64-NEXT: .LBB61_647: # %cond.load485 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 123 +; CHECK-RV64-NEXT: li a3, 
122 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bltz a2, .LBB61_648 +; CHECK-RV64-NEXT: j .LBB61_126 +; CHECK-RV64-NEXT: .LBB61_648: # %cond.load489 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 124 +; CHECK-RV64-NEXT: li a3, 123 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bltz a2, .LBB61_649 +; CHECK-RV64-NEXT: j .LBB61_127 +; CHECK-RV64-NEXT: .LBB61_649: # %cond.load493 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v24, a2 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a2, 125 +; CHECK-RV64-NEXT: li a3, 124 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_1026 +; CHECK-RV64-NEXT: j .LBB61_128 +; CHECK-RV64-NEXT: .LBB61_1026: # %cond.load493 +; CHECK-RV64-NEXT: j .LBB61_129 +; CHECK-RV64-NEXT: .LBB61_650: # %cond.load505 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v24, a1 +; CHECK-RV64-NEXT: vmv8r.v v16, v8 +; CHECK-RV64-NEXT: li a1, 128 +; CHECK-RV64-NEXT: li a3, 127 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m2, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v24, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv2r.v 
v16, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v16 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: bnez a1, .LBB61_651 +; CHECK-RV64-NEXT: j .LBB61_133 +; CHECK-RV64-NEXT: .LBB61_651: # %cond.load509 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 129 +; CHECK-RV64-NEXT: li a3, 128 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: bnez a1, .LBB61_652 +; CHECK-RV64-NEXT: j .LBB61_134 +; CHECK-RV64-NEXT: .LBB61_652: # %cond.load513 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 130 +; CHECK-RV64-NEXT: li a3, 129 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: bnez a1, .LBB61_653 +; CHECK-RV64-NEXT: j .LBB61_135 +; CHECK-RV64-NEXT: .LBB61_653: # %cond.load517 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 131 +; CHECK-RV64-NEXT: li a3, 130 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: bnez a1, .LBB61_654 +; CHECK-RV64-NEXT: j .LBB61_136 +; 
CHECK-RV64-NEXT: .LBB61_654: # %cond.load521 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 132 +; CHECK-RV64-NEXT: li a3, 131 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: bnez a1, .LBB61_655 +; CHECK-RV64-NEXT: j .LBB61_137 +; CHECK-RV64-NEXT: .LBB61_655: # %cond.load525 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 133 +; CHECK-RV64-NEXT: li a3, 132 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: bnez a1, .LBB61_656 +; CHECK-RV64-NEXT: j .LBB61_138 +; CHECK-RV64-NEXT: .LBB61_656: # %cond.load529 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 134 +; CHECK-RV64-NEXT: li a3, 133 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: bnez a1, .LBB61_657 +; CHECK-RV64-NEXT: j .LBB61_139 +; CHECK-RV64-NEXT: .LBB61_657: # %cond.load533 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, 
ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 135 +; CHECK-RV64-NEXT: li a3, 134 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: bnez a1, .LBB61_658 +; CHECK-RV64-NEXT: j .LBB61_140 +; CHECK-RV64-NEXT: .LBB61_658: # %cond.load537 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 136 +; CHECK-RV64-NEXT: li a3, 135 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: bnez a1, .LBB61_659 +; CHECK-RV64-NEXT: j .LBB61_141 +; CHECK-RV64-NEXT: .LBB61_659: # %cond.load541 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 137 +; CHECK-RV64-NEXT: li a3, 136 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: bnez a1, .LBB61_660 +; CHECK-RV64-NEXT: j .LBB61_142 +; CHECK-RV64-NEXT: .LBB61_660: # %cond.load545 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 138 +; CHECK-RV64-NEXT: li a3, 137 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: bnez a1, .LBB61_661 +; CHECK-RV64-NEXT: j .LBB61_143 +; CHECK-RV64-NEXT: .LBB61_661: # %cond.load549 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 139 +; CHECK-RV64-NEXT: li a3, 138 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bltz a1, .LBB61_662 +; CHECK-RV64-NEXT: j .LBB61_144 +; CHECK-RV64-NEXT: .LBB61_662: # %cond.load553 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 140 +; CHECK-RV64-NEXT: li a3, 139 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bltz a1, .LBB61_663 +; CHECK-RV64-NEXT: j .LBB61_145 +; CHECK-RV64-NEXT: .LBB61_663: # %cond.load557 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 141 +; CHECK-RV64-NEXT: li a3, 140 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bltz a1, .LBB61_664 +; CHECK-RV64-NEXT: j .LBB61_146 +; CHECK-RV64-NEXT: .LBB61_664: # %cond.load561 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 142 +; CHECK-RV64-NEXT: li a3, 141 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bltz a1, .LBB61_665 +; CHECK-RV64-NEXT: j .LBB61_147 +; CHECK-RV64-NEXT: .LBB61_665: # %cond.load565 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 143 +; CHECK-RV64-NEXT: li a3, 142 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bltz a1, .LBB61_666 +; CHECK-RV64-NEXT: j .LBB61_148 +; CHECK-RV64-NEXT: .LBB61_666: # %cond.load569 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 144 +; CHECK-RV64-NEXT: li a3, 143 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bltz a1, .LBB61_667 +; CHECK-RV64-NEXT: j .LBB61_149 +; 
CHECK-RV64-NEXT: .LBB61_667: # %cond.load573 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 145 +; CHECK-RV64-NEXT: li a3, 144 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bltz a1, .LBB61_668 +; CHECK-RV64-NEXT: j .LBB61_150 +; CHECK-RV64-NEXT: .LBB61_668: # %cond.load577 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 146 +; CHECK-RV64-NEXT: li a3, 145 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bltz a1, .LBB61_669 +; CHECK-RV64-NEXT: j .LBB61_151 +; CHECK-RV64-NEXT: .LBB61_669: # %cond.load581 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 147 +; CHECK-RV64-NEXT: li a3, 146 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bltz a1, .LBB61_670 +; CHECK-RV64-NEXT: j .LBB61_152 +; CHECK-RV64-NEXT: .LBB61_670: # %cond.load585 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, 
ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 148 +; CHECK-RV64-NEXT: li a3, 147 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bltz a1, .LBB61_671 +; CHECK-RV64-NEXT: j .LBB61_153 +; CHECK-RV64-NEXT: .LBB61_671: # %cond.load589 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 149 +; CHECK-RV64-NEXT: li a3, 148 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bltz a1, .LBB61_672 +; CHECK-RV64-NEXT: j .LBB61_154 +; CHECK-RV64-NEXT: .LBB61_672: # %cond.load593 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 150 +; CHECK-RV64-NEXT: li a3, 149 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bltz a1, .LBB61_673 +; CHECK-RV64-NEXT: j .LBB61_155 +; CHECK-RV64-NEXT: .LBB61_673: # %cond.load597 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 151 +; CHECK-RV64-NEXT: li a3, 150 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bltz a1, .LBB61_674 +; CHECK-RV64-NEXT: j .LBB61_156 +; CHECK-RV64-NEXT: .LBB61_674: # %cond.load601 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 152 +; CHECK-RV64-NEXT: li a3, 151 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bltz a1, .LBB61_675 +; CHECK-RV64-NEXT: j .LBB61_157 +; CHECK-RV64-NEXT: .LBB61_675: # %cond.load605 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 153 +; CHECK-RV64-NEXT: li a3, 152 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bltz a1, .LBB61_676 +; CHECK-RV64-NEXT: j .LBB61_158 +; CHECK-RV64-NEXT: .LBB61_676: # %cond.load609 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 154 +; CHECK-RV64-NEXT: li a3, 153 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bltz a1, .LBB61_677 +; CHECK-RV64-NEXT: j .LBB61_159 +; CHECK-RV64-NEXT: .LBB61_677: # %cond.load613 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 155 +; CHECK-RV64-NEXT: li a3, 154 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bltz a1, .LBB61_678 +; CHECK-RV64-NEXT: j .LBB61_160 +; CHECK-RV64-NEXT: .LBB61_678: # %cond.load617 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 156 +; CHECK-RV64-NEXT: li a3, 155 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bltz a1, .LBB61_679 +; CHECK-RV64-NEXT: j .LBB61_161 +; CHECK-RV64-NEXT: .LBB61_679: # %cond.load621 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 157 +; CHECK-RV64-NEXT: li a3, 156 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bltz a1, .LBB61_680 +; CHECK-RV64-NEXT: j .LBB61_162 +; 
CHECK-RV64-NEXT: .LBB61_680: # %cond.load625 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 158 +; CHECK-RV64-NEXT: li a3, 157 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bltz a1, .LBB61_681 +; CHECK-RV64-NEXT: j .LBB61_163 +; CHECK-RV64-NEXT: .LBB61_681: # %cond.load629 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 159 +; CHECK-RV64-NEXT: li a3, 158 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bltz a1, .LBB61_682 +; CHECK-RV64-NEXT: j .LBB61_164 +; CHECK-RV64-NEXT: .LBB61_682: # %cond.load633 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 160 +; CHECK-RV64-NEXT: li a3, 159 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bltz a1, .LBB61_683 +; CHECK-RV64-NEXT: j .LBB61_165 +; CHECK-RV64-NEXT: .LBB61_683: # %cond.load637 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, 
ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 161 +; CHECK-RV64-NEXT: li a3, 160 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bltz a1, .LBB61_684 +; CHECK-RV64-NEXT: j .LBB61_166 +; CHECK-RV64-NEXT: .LBB61_684: # %cond.load641 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 162 +; CHECK-RV64-NEXT: li a3, 161 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bltz a1, .LBB61_685 +; CHECK-RV64-NEXT: j .LBB61_167 +; CHECK-RV64-NEXT: .LBB61_685: # %cond.load645 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 163 +; CHECK-RV64-NEXT: li a3, 162 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bltz a1, .LBB61_686 +; CHECK-RV64-NEXT: j .LBB61_168 +; CHECK-RV64-NEXT: .LBB61_686: # %cond.load649 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 164 +; CHECK-RV64-NEXT: li a3, 163 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bltz a1, .LBB61_687 +; CHECK-RV64-NEXT: j .LBB61_169 +; CHECK-RV64-NEXT: .LBB61_687: # %cond.load653 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 165 +; CHECK-RV64-NEXT: li a3, 164 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bltz a1, .LBB61_688 +; CHECK-RV64-NEXT: j .LBB61_170 +; CHECK-RV64-NEXT: .LBB61_688: # %cond.load657 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 166 +; CHECK-RV64-NEXT: li a3, 165 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bltz a1, .LBB61_689 +; CHECK-RV64-NEXT: j .LBB61_171 +; CHECK-RV64-NEXT: .LBB61_689: # %cond.load661 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 167 +; CHECK-RV64-NEXT: li a3, 166 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bltz a1, .LBB61_690 +; CHECK-RV64-NEXT: j .LBB61_172 +; CHECK-RV64-NEXT: .LBB61_690: # %cond.load665 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 168 +; CHECK-RV64-NEXT: li a3, 167 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bltz a1, .LBB61_691 +; CHECK-RV64-NEXT: j .LBB61_173 +; CHECK-RV64-NEXT: .LBB61_691: # %cond.load669 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 169 +; CHECK-RV64-NEXT: li a3, 168 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bltz a1, .LBB61_692 +; CHECK-RV64-NEXT: j .LBB61_174 +; CHECK-RV64-NEXT: .LBB61_692: # %cond.load673 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 170 +; CHECK-RV64-NEXT: li a3, 169 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bltz a1, .LBB61_693 +; CHECK-RV64-NEXT: j .LBB61_175 +; 
CHECK-RV64-NEXT: .LBB61_693: # %cond.load677 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 171 +; CHECK-RV64-NEXT: li a3, 170 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bltz a1, .LBB61_694 +; CHECK-RV64-NEXT: j .LBB61_176 +; CHECK-RV64-NEXT: .LBB61_694: # %cond.load681 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 172 +; CHECK-RV64-NEXT: li a3, 171 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bltz a1, .LBB61_695 +; CHECK-RV64-NEXT: j .LBB61_177 +; CHECK-RV64-NEXT: .LBB61_695: # %cond.load685 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 173 +; CHECK-RV64-NEXT: li a3, 172 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bltz a1, .LBB61_696 +; CHECK-RV64-NEXT: j .LBB61_178 +; CHECK-RV64-NEXT: .LBB61_696: # %cond.load689 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, 
ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 174 +; CHECK-RV64-NEXT: li a3, 173 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bltz a1, .LBB61_697 +; CHECK-RV64-NEXT: j .LBB61_179 +; CHECK-RV64-NEXT: .LBB61_697: # %cond.load693 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 175 +; CHECK-RV64-NEXT: li a3, 174 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bltz a1, .LBB61_698 +; CHECK-RV64-NEXT: j .LBB61_180 +; CHECK-RV64-NEXT: .LBB61_698: # %cond.load697 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 176 +; CHECK-RV64-NEXT: li a3, 175 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bltz a1, .LBB61_699 +; CHECK-RV64-NEXT: j .LBB61_181 +; CHECK-RV64-NEXT: .LBB61_699: # %cond.load701 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 177 +; CHECK-RV64-NEXT: li a3, 176 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bltz a1, .LBB61_700 +; CHECK-RV64-NEXT: j .LBB61_182 +; CHECK-RV64-NEXT: .LBB61_700: # %cond.load705 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 178 +; CHECK-RV64-NEXT: li a3, 177 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bltz a1, .LBB61_701 +; CHECK-RV64-NEXT: j .LBB61_183 +; CHECK-RV64-NEXT: .LBB61_701: # %cond.load709 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 179 +; CHECK-RV64-NEXT: li a3, 178 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bltz a1, .LBB61_702 +; CHECK-RV64-NEXT: j .LBB61_184 +; CHECK-RV64-NEXT: .LBB61_702: # %cond.load713 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 180 +; CHECK-RV64-NEXT: li a3, 179 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; 
CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bltz a1, .LBB61_703 +; CHECK-RV64-NEXT: j .LBB61_185 +; CHECK-RV64-NEXT: .LBB61_703: # %cond.load717 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 181 +; CHECK-RV64-NEXT: li a3, 180 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bltz a1, .LBB61_704 +; CHECK-RV64-NEXT: j .LBB61_186 +; CHECK-RV64-NEXT: .LBB61_704: # %cond.load721 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 182 +; CHECK-RV64-NEXT: li a3, 181 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bltz a1, .LBB61_705 +; CHECK-RV64-NEXT: j .LBB61_187 +; CHECK-RV64-NEXT: .LBB61_705: # %cond.load725 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 183 +; CHECK-RV64-NEXT: li a3, 182 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bltz a1, .LBB61_706 +; CHECK-RV64-NEXT: j .LBB61_188 +; CHECK-RV64-NEXT: 
.LBB61_706: # %cond.load729 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 184 +; CHECK-RV64-NEXT: li a3, 183 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bltz a1, .LBB61_707 +; CHECK-RV64-NEXT: j .LBB61_189 +; CHECK-RV64-NEXT: .LBB61_707: # %cond.load733 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 185 +; CHECK-RV64-NEXT: li a3, 184 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bltz a1, .LBB61_708 +; CHECK-RV64-NEXT: j .LBB61_190 +; CHECK-RV64-NEXT: .LBB61_708: # %cond.load737 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 186 +; CHECK-RV64-NEXT: li a3, 185 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bltz a1, .LBB61_709 +; CHECK-RV64-NEXT: j .LBB61_191 +; CHECK-RV64-NEXT: .LBB61_709: # %cond.load741 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; 
CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 187 +; CHECK-RV64-NEXT: li a3, 186 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bltz a1, .LBB61_710 +; CHECK-RV64-NEXT: j .LBB61_192 +; CHECK-RV64-NEXT: .LBB61_710: # %cond.load745 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 188 +; CHECK-RV64-NEXT: li a3, 187 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bltz a1, .LBB61_711 +; CHECK-RV64-NEXT: j .LBB61_193 +; CHECK-RV64-NEXT: .LBB61_711: # %cond.load749 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 189 +; CHECK-RV64-NEXT: li a3, 188 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_1027 +; CHECK-RV64-NEXT: j .LBB61_194 +; CHECK-RV64-NEXT: .LBB61_1027: # %cond.load749 +; CHECK-RV64-NEXT: j .LBB61_195 +; CHECK-RV64-NEXT: .LBB61_712: # %cond.load761 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 192 +; CHECK-RV64-NEXT: li a3, 191 +; CHECK-RV64-NEXT: vsetvli zero, 
a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: bnez a2, .LBB61_713 +; CHECK-RV64-NEXT: j .LBB61_199 +; CHECK-RV64-NEXT: .LBB61_713: # %cond.load765 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 193 +; CHECK-RV64-NEXT: li a3, 192 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: bnez a2, .LBB61_714 +; CHECK-RV64-NEXT: j .LBB61_200 +; CHECK-RV64-NEXT: .LBB61_714: # %cond.load769 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 194 +; CHECK-RV64-NEXT: li a3, 193 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: bnez a2, .LBB61_715 +; CHECK-RV64-NEXT: j .LBB61_201 +; CHECK-RV64-NEXT: .LBB61_715: # %cond.load773 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 195 +; CHECK-RV64-NEXT: li a3, 194 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v 
v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: bnez a2, .LBB61_716 +; CHECK-RV64-NEXT: j .LBB61_202 +; CHECK-RV64-NEXT: .LBB61_716: # %cond.load777 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 196 +; CHECK-RV64-NEXT: li a3, 195 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: bnez a2, .LBB61_717 +; CHECK-RV64-NEXT: j .LBB61_203 +; CHECK-RV64-NEXT: .LBB61_717: # %cond.load781 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 197 +; CHECK-RV64-NEXT: li a3, 196 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: bnez a2, .LBB61_718 +; CHECK-RV64-NEXT: j .LBB61_204 +; CHECK-RV64-NEXT: .LBB61_718: # %cond.load785 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 198 +; CHECK-RV64-NEXT: li a3, 197 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: bnez a2, .LBB61_719 +; CHECK-RV64-NEXT: j .LBB61_205 +; CHECK-RV64-NEXT: .LBB61_719: # 
%cond.load789 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 199 +; CHECK-RV64-NEXT: li a3, 198 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: bnez a2, .LBB61_720 +; CHECK-RV64-NEXT: j .LBB61_206 +; CHECK-RV64-NEXT: .LBB61_720: # %cond.load793 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 200 +; CHECK-RV64-NEXT: li a3, 199 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: bnez a2, .LBB61_721 +; CHECK-RV64-NEXT: j .LBB61_207 +; CHECK-RV64-NEXT: .LBB61_721: # %cond.load797 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 201 +; CHECK-RV64-NEXT: li a3, 200 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: bnez a2, .LBB61_722 +; CHECK-RV64-NEXT: j .LBB61_208 +; CHECK-RV64-NEXT: .LBB61_722: # %cond.load801 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: 
vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 202 +; CHECK-RV64-NEXT: li a3, 201 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: bnez a2, .LBB61_723 +; CHECK-RV64-NEXT: j .LBB61_209 +; CHECK-RV64-NEXT: .LBB61_723: # %cond.load805 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 203 +; CHECK-RV64-NEXT: li a3, 202 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bltz a2, .LBB61_724 +; CHECK-RV64-NEXT: j .LBB61_210 +; CHECK-RV64-NEXT: .LBB61_724: # %cond.load809 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 204 +; CHECK-RV64-NEXT: li a3, 203 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bltz a2, .LBB61_725 +; CHECK-RV64-NEXT: j .LBB61_211 +; CHECK-RV64-NEXT: .LBB61_725: # %cond.load813 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 205 +; CHECK-RV64-NEXT: li a3, 204 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, 
m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bltz a2, .LBB61_726 +; CHECK-RV64-NEXT: j .LBB61_212 +; CHECK-RV64-NEXT: .LBB61_726: # %cond.load817 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 206 +; CHECK-RV64-NEXT: li a3, 205 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bltz a2, .LBB61_727 +; CHECK-RV64-NEXT: j .LBB61_213 +; CHECK-RV64-NEXT: .LBB61_727: # %cond.load821 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 207 +; CHECK-RV64-NEXT: li a3, 206 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bltz a2, .LBB61_728 +; CHECK-RV64-NEXT: j .LBB61_214 +; CHECK-RV64-NEXT: .LBB61_728: # %cond.load825 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 208 +; CHECK-RV64-NEXT: li a3, 207 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, 
v24 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bltz a2, .LBB61_729 +; CHECK-RV64-NEXT: j .LBB61_215 +; CHECK-RV64-NEXT: .LBB61_729: # %cond.load829 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 209 +; CHECK-RV64-NEXT: li a3, 208 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bltz a2, .LBB61_730 +; CHECK-RV64-NEXT: j .LBB61_216 +; CHECK-RV64-NEXT: .LBB61_730: # %cond.load833 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 210 +; CHECK-RV64-NEXT: li a3, 209 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bltz a2, .LBB61_731 +; CHECK-RV64-NEXT: j .LBB61_217 +; CHECK-RV64-NEXT: .LBB61_731: # %cond.load837 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 211 +; CHECK-RV64-NEXT: li a3, 210 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bltz a2, .LBB61_732 +; CHECK-RV64-NEXT: j .LBB61_218 +; CHECK-RV64-NEXT: .LBB61_732: # %cond.load841 
+; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 212 +; CHECK-RV64-NEXT: li a3, 211 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bltz a2, .LBB61_733 +; CHECK-RV64-NEXT: j .LBB61_219 +; CHECK-RV64-NEXT: .LBB61_733: # %cond.load845 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 213 +; CHECK-RV64-NEXT: li a3, 212 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bltz a2, .LBB61_734 +; CHECK-RV64-NEXT: j .LBB61_220 +; CHECK-RV64-NEXT: .LBB61_734: # %cond.load849 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 214 +; CHECK-RV64-NEXT: li a3, 213 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bltz a2, .LBB61_735 +; CHECK-RV64-NEXT: j .LBB61_221 +; CHECK-RV64-NEXT: .LBB61_735: # %cond.load853 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 215 +; CHECK-RV64-NEXT: li a3, 214 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bltz a2, .LBB61_736 +; CHECK-RV64-NEXT: j .LBB61_222 +; CHECK-RV64-NEXT: .LBB61_736: # %cond.load857 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 216 +; CHECK-RV64-NEXT: li a3, 215 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bltz a2, .LBB61_737 +; CHECK-RV64-NEXT: j .LBB61_223 +; CHECK-RV64-NEXT: .LBB61_737: # %cond.load861 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 217 +; CHECK-RV64-NEXT: li a3, 216 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bltz a2, .LBB61_738 +; CHECK-RV64-NEXT: j .LBB61_224 +; CHECK-RV64-NEXT: .LBB61_738: # %cond.load865 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 218 +; CHECK-RV64-NEXT: li a3, 217 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bltz a2, .LBB61_739 +; CHECK-RV64-NEXT: j .LBB61_225 +; CHECK-RV64-NEXT: .LBB61_739: # %cond.load869 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 219 +; CHECK-RV64-NEXT: li a3, 218 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bltz a2, .LBB61_740 +; CHECK-RV64-NEXT: j .LBB61_226 +; CHECK-RV64-NEXT: .LBB61_740: # %cond.load873 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 220 +; CHECK-RV64-NEXT: li a3, 219 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bltz a2, .LBB61_741 +; CHECK-RV64-NEXT: j .LBB61_227 +; CHECK-RV64-NEXT: .LBB61_741: # %cond.load877 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 221 +; CHECK-RV64-NEXT: li a3, 220 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bltz a2, .LBB61_742 +; CHECK-RV64-NEXT: j .LBB61_228 +; CHECK-RV64-NEXT: .LBB61_742: # %cond.load881 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 222 +; CHECK-RV64-NEXT: li a3, 221 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bltz a2, .LBB61_743 +; CHECK-RV64-NEXT: j .LBB61_229 +; CHECK-RV64-NEXT: .LBB61_743: # %cond.load885 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 223 +; CHECK-RV64-NEXT: li a3, 222 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bltz a2, .LBB61_744 +; CHECK-RV64-NEXT: j .LBB61_230 +; CHECK-RV64-NEXT: .LBB61_744: # %cond.load889 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 224 +; CHECK-RV64-NEXT: li a3, 223 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bltz a2, .LBB61_745 +; CHECK-RV64-NEXT: j .LBB61_231 +; CHECK-RV64-NEXT: .LBB61_745: # %cond.load893 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 225 +; CHECK-RV64-NEXT: li a3, 224 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bltz a2, .LBB61_746 +; CHECK-RV64-NEXT: j .LBB61_232 +; CHECK-RV64-NEXT: .LBB61_746: # %cond.load897 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 226 +; CHECK-RV64-NEXT: li a3, 225 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bltz a2, .LBB61_747 +; CHECK-RV64-NEXT: j .LBB61_233 +; CHECK-RV64-NEXT: .LBB61_747: # %cond.load901 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 227 +; CHECK-RV64-NEXT: li a3, 226 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bltz a2, .LBB61_748 +; CHECK-RV64-NEXT: j .LBB61_234 +; CHECK-RV64-NEXT: .LBB61_748: # %cond.load905 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 228 +; CHECK-RV64-NEXT: li a3, 227 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bltz a2, .LBB61_749 +; CHECK-RV64-NEXT: j .LBB61_235 +; CHECK-RV64-NEXT: .LBB61_749: # %cond.load909 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 229 +; CHECK-RV64-NEXT: li a3, 228 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bltz a2, .LBB61_750 +; CHECK-RV64-NEXT: j .LBB61_236 +; CHECK-RV64-NEXT: .LBB61_750: # %cond.load913 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 230 +; CHECK-RV64-NEXT: li a3, 229 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bltz a2, .LBB61_751 +; CHECK-RV64-NEXT: j .LBB61_237 +; CHECK-RV64-NEXT: .LBB61_751: # %cond.load917 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 231 +; CHECK-RV64-NEXT: li a3, 230 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bltz a2, .LBB61_752 +; CHECK-RV64-NEXT: j .LBB61_238 +; CHECK-RV64-NEXT: .LBB61_752: # %cond.load921 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 232 +; CHECK-RV64-NEXT: li a3, 231 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bltz a2, .LBB61_753 +; CHECK-RV64-NEXT: j .LBB61_239 +; CHECK-RV64-NEXT: .LBB61_753: # %cond.load925 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 233 +; CHECK-RV64-NEXT: li a3, 232 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bltz a2, .LBB61_754 +; CHECK-RV64-NEXT: j .LBB61_240 +; CHECK-RV64-NEXT: .LBB61_754: # %cond.load929 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 234 +; CHECK-RV64-NEXT: li a3, 233 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bltz a2, .LBB61_755 +; CHECK-RV64-NEXT: j .LBB61_241 +; CHECK-RV64-NEXT: .LBB61_755: # %cond.load933 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 235 +; CHECK-RV64-NEXT: li a3, 234 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bltz a2, .LBB61_756 +; CHECK-RV64-NEXT: j .LBB61_242 +; CHECK-RV64-NEXT: .LBB61_756: # %cond.load937 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 236 +; CHECK-RV64-NEXT: li a3, 235 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bltz a2, .LBB61_757 +; CHECK-RV64-NEXT: j .LBB61_243 +; CHECK-RV64-NEXT: .LBB61_757: # %cond.load941 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 237 +; CHECK-RV64-NEXT: li a3, 236 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bltz a2, .LBB61_758 +; CHECK-RV64-NEXT: j .LBB61_244 +; CHECK-RV64-NEXT: .LBB61_758: # %cond.load945 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 238 +; CHECK-RV64-NEXT: li a3, 237 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bltz a2, .LBB61_759 +; CHECK-RV64-NEXT: j .LBB61_245 +; CHECK-RV64-NEXT: .LBB61_759: # %cond.load949 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 239 +; CHECK-RV64-NEXT: li a3, 238 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bltz a2, .LBB61_760 +; CHECK-RV64-NEXT: j .LBB61_246 +; CHECK-RV64-NEXT: .LBB61_760: # %cond.load953 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 240 +; CHECK-RV64-NEXT: li a3, 239 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bltz a2, .LBB61_761 +; CHECK-RV64-NEXT: j .LBB61_247 +; CHECK-RV64-NEXT: .LBB61_761: # %cond.load957 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 241 +; CHECK-RV64-NEXT: li a3, 240 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bltz a2, .LBB61_762 +; CHECK-RV64-NEXT: j .LBB61_248 +; CHECK-RV64-NEXT: .LBB61_762: # %cond.load961 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 242 +; CHECK-RV64-NEXT: li a3, 241 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bltz a2, .LBB61_763 +; CHECK-RV64-NEXT: j .LBB61_249 +; CHECK-RV64-NEXT: .LBB61_763: # %cond.load965 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 243 +; CHECK-RV64-NEXT: li a3, 242 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bltz a2, .LBB61_764 +; CHECK-RV64-NEXT: j .LBB61_250 +; CHECK-RV64-NEXT: .LBB61_764: # %cond.load969 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 244 +; CHECK-RV64-NEXT: li a3, 243 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bltz a2, .LBB61_765 +; CHECK-RV64-NEXT: j .LBB61_251 +; CHECK-RV64-NEXT: .LBB61_765: # %cond.load973 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 245 +; CHECK-RV64-NEXT: li a3, 244 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bltz a2, .LBB61_766 +; CHECK-RV64-NEXT: j .LBB61_252 +; CHECK-RV64-NEXT: .LBB61_766: # %cond.load977 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 246 +; CHECK-RV64-NEXT: li a3, 245 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bltz a2, .LBB61_767 +; CHECK-RV64-NEXT: j .LBB61_253 +; CHECK-RV64-NEXT: .LBB61_767: # %cond.load981 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 247 +; CHECK-RV64-NEXT: li a3, 246 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; 
CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bltz a2, .LBB61_768 +; CHECK-RV64-NEXT: j .LBB61_254 +; CHECK-RV64-NEXT: .LBB61_768: # %cond.load985 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 248 +; CHECK-RV64-NEXT: li a3, 247 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bltz a2, .LBB61_769 +; CHECK-RV64-NEXT: j .LBB61_255 +; CHECK-RV64-NEXT: .LBB61_769: # %cond.load989 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 249 +; CHECK-RV64-NEXT: li a3, 248 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bltz a2, .LBB61_770 +; CHECK-RV64-NEXT: j .LBB61_256 +; CHECK-RV64-NEXT: .LBB61_770: # %cond.load993 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 250 +; CHECK-RV64-NEXT: li a3, 249 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bltz a2, .LBB61_771 +; CHECK-RV64-NEXT: j .LBB61_257 +; CHECK-RV64-NEXT: .LBB61_771: # %cond.load997 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 251 +; CHECK-RV64-NEXT: li a3, 250 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bltz a2, .LBB61_772 +; CHECK-RV64-NEXT: j .LBB61_258 +; CHECK-RV64-NEXT: .LBB61_772: # %cond.load1001 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 252 +; CHECK-RV64-NEXT: li a3, 251 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bltz a2, .LBB61_773 +; CHECK-RV64-NEXT: j .LBB61_259 +; CHECK-RV64-NEXT: .LBB61_773: # %cond.load1005 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a2, 253 +; CHECK-RV64-NEXT: li a3, 252 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_1028 +; CHECK-RV64-NEXT: j .LBB61_260 +; CHECK-RV64-NEXT: .LBB61_1028: # %cond.load1005 +; CHECK-RV64-NEXT: j .LBB61_261 +; CHECK-RV64-NEXT: .LBB61_774: # %cond.load1017 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; 
CHECK-RV64-NEXT: vmv8r.v v24, v8 +; CHECK-RV64-NEXT: li a1, 256 +; CHECK-RV64-NEXT: li a3, 255 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: vmv4r.v v24, v8 +; CHECK-RV64-NEXT: vmv8r.v v8, v24 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: bnez a1, .LBB61_775 +; CHECK-RV64-NEXT: j .LBB61_265 +; CHECK-RV64-NEXT: .LBB61_775: # %cond.load1021 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 257 +; CHECK-RV64-NEXT: li a3, 256 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: bnez a1, .LBB61_776 +; CHECK-RV64-NEXT: j .LBB61_266 +; CHECK-RV64-NEXT: .LBB61_776: # %cond.load1025 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 258 +; CHECK-RV64-NEXT: li a3, 257 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: bnez a1, .LBB61_777 +; CHECK-RV64-NEXT: j .LBB61_267 +; CHECK-RV64-NEXT: .LBB61_777: # %cond.load1029 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 259 +; CHECK-RV64-NEXT: li a3, 258 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: bnez a1, .LBB61_778 +; CHECK-RV64-NEXT: j .LBB61_268 +; CHECK-RV64-NEXT: .LBB61_778: # %cond.load1033 +; CHECK-RV64-NEXT: lbu a1, 
0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 260 +; CHECK-RV64-NEXT: li a3, 259 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: bnez a1, .LBB61_779 +; CHECK-RV64-NEXT: j .LBB61_269 +; CHECK-RV64-NEXT: .LBB61_779: # %cond.load1037 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 261 +; CHECK-RV64-NEXT: li a3, 260 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: bnez a1, .LBB61_780 +; CHECK-RV64-NEXT: j .LBB61_270 +; CHECK-RV64-NEXT: .LBB61_780: # %cond.load1041 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 262 +; CHECK-RV64-NEXT: li a3, 261 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: bnez a1, .LBB61_781 +; CHECK-RV64-NEXT: j .LBB61_271 +; CHECK-RV64-NEXT: .LBB61_781: # %cond.load1045 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 263 +; CHECK-RV64-NEXT: li a3, 262 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: bnez a1, .LBB61_782 +; CHECK-RV64-NEXT: j .LBB61_272 +; CHECK-RV64-NEXT: .LBB61_782: # %cond.load1049 +; 
CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 264 +; CHECK-RV64-NEXT: li a3, 263 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: bnez a1, .LBB61_783 +; CHECK-RV64-NEXT: j .LBB61_273 +; CHECK-RV64-NEXT: .LBB61_783: # %cond.load1053 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 265 +; CHECK-RV64-NEXT: li a3, 264 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: bnez a1, .LBB61_784 +; CHECK-RV64-NEXT: j .LBB61_274 +; CHECK-RV64-NEXT: .LBB61_784: # %cond.load1057 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 266 +; CHECK-RV64-NEXT: li a3, 265 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: bnez a1, .LBB61_785 +; CHECK-RV64-NEXT: j .LBB61_275 +; CHECK-RV64-NEXT: .LBB61_785: # %cond.load1061 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 267 +; CHECK-RV64-NEXT: li a3, 266 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bltz a1, .LBB61_786 +; CHECK-RV64-NEXT: j .LBB61_276 +; CHECK-RV64-NEXT: 
.LBB61_786: # %cond.load1065 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 268 +; CHECK-RV64-NEXT: li a3, 267 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bltz a1, .LBB61_787 +; CHECK-RV64-NEXT: j .LBB61_277 +; CHECK-RV64-NEXT: .LBB61_787: # %cond.load1069 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 269 +; CHECK-RV64-NEXT: li a3, 268 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bltz a1, .LBB61_788 +; CHECK-RV64-NEXT: j .LBB61_278 +; CHECK-RV64-NEXT: .LBB61_788: # %cond.load1073 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 270 +; CHECK-RV64-NEXT: li a3, 269 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bltz a1, .LBB61_789 +; CHECK-RV64-NEXT: j .LBB61_279 +; CHECK-RV64-NEXT: .LBB61_789: # %cond.load1077 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 271 +; CHECK-RV64-NEXT: li a3, 270 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bltz a1, .LBB61_790 +; CHECK-RV64-NEXT: j 
.LBB61_280 +; CHECK-RV64-NEXT: .LBB61_790: # %cond.load1081 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 272 +; CHECK-RV64-NEXT: li a3, 271 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bltz a1, .LBB61_791 +; CHECK-RV64-NEXT: j .LBB61_281 +; CHECK-RV64-NEXT: .LBB61_791: # %cond.load1085 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 273 +; CHECK-RV64-NEXT: li a3, 272 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bltz a1, .LBB61_792 +; CHECK-RV64-NEXT: j .LBB61_282 +; CHECK-RV64-NEXT: .LBB61_792: # %cond.load1089 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 274 +; CHECK-RV64-NEXT: li a3, 273 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bltz a1, .LBB61_793 +; CHECK-RV64-NEXT: j .LBB61_283 +; CHECK-RV64-NEXT: .LBB61_793: # %cond.load1093 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 275 +; CHECK-RV64-NEXT: li a3, 274 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bltz a1, 
.LBB61_794 +; CHECK-RV64-NEXT: j .LBB61_284 +; CHECK-RV64-NEXT: .LBB61_794: # %cond.load1097 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 276 +; CHECK-RV64-NEXT: li a3, 275 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bltz a1, .LBB61_795 +; CHECK-RV64-NEXT: j .LBB61_285 +; CHECK-RV64-NEXT: .LBB61_795: # %cond.load1101 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 277 +; CHECK-RV64-NEXT: li a3, 276 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bltz a1, .LBB61_796 +; CHECK-RV64-NEXT: j .LBB61_286 +; CHECK-RV64-NEXT: .LBB61_796: # %cond.load1105 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 278 +; CHECK-RV64-NEXT: li a3, 277 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bltz a1, .LBB61_797 +; CHECK-RV64-NEXT: j .LBB61_287 +; CHECK-RV64-NEXT: .LBB61_797: # %cond.load1109 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 279 +; CHECK-RV64-NEXT: li a3, 278 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; 
CHECK-RV64-NEXT: bltz a1, .LBB61_798 +; CHECK-RV64-NEXT: j .LBB61_288 +; CHECK-RV64-NEXT: .LBB61_798: # %cond.load1113 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 280 +; CHECK-RV64-NEXT: li a3, 279 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bltz a1, .LBB61_799 +; CHECK-RV64-NEXT: j .LBB61_289 +; CHECK-RV64-NEXT: .LBB61_799: # %cond.load1117 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 281 +; CHECK-RV64-NEXT: li a3, 280 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bltz a1, .LBB61_800 +; CHECK-RV64-NEXT: j .LBB61_290 +; CHECK-RV64-NEXT: .LBB61_800: # %cond.load1121 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 282 +; CHECK-RV64-NEXT: li a3, 281 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bltz a1, .LBB61_801 +; CHECK-RV64-NEXT: j .LBB61_291 +; CHECK-RV64-NEXT: .LBB61_801: # %cond.load1125 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 283 +; CHECK-RV64-NEXT: li a3, 282 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; 
CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bltz a1, .LBB61_802 +; CHECK-RV64-NEXT: j .LBB61_292 +; CHECK-RV64-NEXT: .LBB61_802: # %cond.load1129 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 284 +; CHECK-RV64-NEXT: li a3, 283 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bltz a1, .LBB61_803 +; CHECK-RV64-NEXT: j .LBB61_293 +; CHECK-RV64-NEXT: .LBB61_803: # %cond.load1133 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 285 +; CHECK-RV64-NEXT: li a3, 284 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bltz a1, .LBB61_804 +; CHECK-RV64-NEXT: j .LBB61_294 +; CHECK-RV64-NEXT: .LBB61_804: # %cond.load1137 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 286 +; CHECK-RV64-NEXT: li a3, 285 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bltz a1, .LBB61_805 +; CHECK-RV64-NEXT: j .LBB61_295 +; CHECK-RV64-NEXT: .LBB61_805: # %cond.load1141 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 287 +; CHECK-RV64-NEXT: li a3, 286 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bltz a1, .LBB61_806 +; CHECK-RV64-NEXT: j .LBB61_296 +; CHECK-RV64-NEXT: .LBB61_806: # %cond.load1145 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 288 +; CHECK-RV64-NEXT: li a3, 287 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bltz a1, .LBB61_807 +; CHECK-RV64-NEXT: j .LBB61_297 +; CHECK-RV64-NEXT: .LBB61_807: # %cond.load1149 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 289 +; CHECK-RV64-NEXT: li a3, 288 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; CHECK-RV64-NEXT: bltz a1, .LBB61_808 +; CHECK-RV64-NEXT: j .LBB61_298 +; CHECK-RV64-NEXT: .LBB61_808: # %cond.load1153 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 290 +; CHECK-RV64-NEXT: li a3, 289 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bltz a1, .LBB61_809 +; CHECK-RV64-NEXT: j .LBB61_299 +; CHECK-RV64-NEXT: .LBB61_809: # %cond.load1157 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 291 +; CHECK-RV64-NEXT: li a3, 290 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: 
vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bltz a1, .LBB61_810 +; CHECK-RV64-NEXT: j .LBB61_300 +; CHECK-RV64-NEXT: .LBB61_810: # %cond.load1161 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 292 +; CHECK-RV64-NEXT: li a3, 291 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bltz a1, .LBB61_811 +; CHECK-RV64-NEXT: j .LBB61_301 +; CHECK-RV64-NEXT: .LBB61_811: # %cond.load1165 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 293 +; CHECK-RV64-NEXT: li a3, 292 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bltz a1, .LBB61_812 +; CHECK-RV64-NEXT: j .LBB61_302 +; CHECK-RV64-NEXT: .LBB61_812: # %cond.load1169 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 294 +; CHECK-RV64-NEXT: li a3, 293 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bltz a1, .LBB61_813 +; CHECK-RV64-NEXT: j .LBB61_303 +; CHECK-RV64-NEXT: .LBB61_813: # %cond.load1173 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 295 +; CHECK-RV64-NEXT: li a3, 294 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, 
tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bltz a1, .LBB61_814 +; CHECK-RV64-NEXT: j .LBB61_304 +; CHECK-RV64-NEXT: .LBB61_814: # %cond.load1177 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 296 +; CHECK-RV64-NEXT: li a3, 295 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bltz a1, .LBB61_815 +; CHECK-RV64-NEXT: j .LBB61_305 +; CHECK-RV64-NEXT: .LBB61_815: # %cond.load1181 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 297 +; CHECK-RV64-NEXT: li a3, 296 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bltz a1, .LBB61_816 +; CHECK-RV64-NEXT: j .LBB61_306 +; CHECK-RV64-NEXT: .LBB61_816: # %cond.load1185 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 298 +; CHECK-RV64-NEXT: li a3, 297 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bltz a1, .LBB61_817 +; CHECK-RV64-NEXT: j .LBB61_307 +; CHECK-RV64-NEXT: .LBB61_817: # %cond.load1189 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 299 +; CHECK-RV64-NEXT: li a3, 298 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bltz a1, .LBB61_818 +; CHECK-RV64-NEXT: j .LBB61_308 +; CHECK-RV64-NEXT: .LBB61_818: # %cond.load1193 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 300 +; CHECK-RV64-NEXT: li a3, 299 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bltz a1, .LBB61_819 +; CHECK-RV64-NEXT: j .LBB61_309 +; CHECK-RV64-NEXT: .LBB61_819: # %cond.load1197 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 301 +; CHECK-RV64-NEXT: li a3, 300 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bltz a1, .LBB61_820 +; CHECK-RV64-NEXT: j .LBB61_310 +; CHECK-RV64-NEXT: .LBB61_820: # %cond.load1201 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 302 +; CHECK-RV64-NEXT: li a3, 301 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bltz a1, .LBB61_821 +; CHECK-RV64-NEXT: j .LBB61_311 +; CHECK-RV64-NEXT: .LBB61_821: # %cond.load1205 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 303 +; CHECK-RV64-NEXT: li a3, 
302 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bltz a1, .LBB61_822 +; CHECK-RV64-NEXT: j .LBB61_312 +; CHECK-RV64-NEXT: .LBB61_822: # %cond.load1209 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 304 +; CHECK-RV64-NEXT: li a3, 303 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bltz a1, .LBB61_823 +; CHECK-RV64-NEXT: j .LBB61_313 +; CHECK-RV64-NEXT: .LBB61_823: # %cond.load1213 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 305 +; CHECK-RV64-NEXT: li a3, 304 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bltz a1, .LBB61_824 +; CHECK-RV64-NEXT: j .LBB61_314 +; CHECK-RV64-NEXT: .LBB61_824: # %cond.load1217 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 306 +; CHECK-RV64-NEXT: li a3, 305 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bltz a1, .LBB61_825 +; CHECK-RV64-NEXT: j .LBB61_315 +; CHECK-RV64-NEXT: .LBB61_825: # %cond.load1221 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 307 +; 
CHECK-RV64-NEXT: li a3, 306 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bltz a1, .LBB61_826 +; CHECK-RV64-NEXT: j .LBB61_316 +; CHECK-RV64-NEXT: .LBB61_826: # %cond.load1225 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 308 +; CHECK-RV64-NEXT: li a3, 307 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bltz a1, .LBB61_827 +; CHECK-RV64-NEXT: j .LBB61_317 +; CHECK-RV64-NEXT: .LBB61_827: # %cond.load1229 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 309 +; CHECK-RV64-NEXT: li a3, 308 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bltz a1, .LBB61_828 +; CHECK-RV64-NEXT: j .LBB61_318 +; CHECK-RV64-NEXT: .LBB61_828: # %cond.load1233 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 310 +; CHECK-RV64-NEXT: li a3, 309 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bltz a1, .LBB61_829 +; CHECK-RV64-NEXT: j .LBB61_319 +; CHECK-RV64-NEXT: .LBB61_829: # %cond.load1237 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; 
CHECK-RV64-NEXT: li a1, 311 +; CHECK-RV64-NEXT: li a3, 310 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bltz a1, .LBB61_830 +; CHECK-RV64-NEXT: j .LBB61_320 +; CHECK-RV64-NEXT: .LBB61_830: # %cond.load1241 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 312 +; CHECK-RV64-NEXT: li a3, 311 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bltz a1, .LBB61_831 +; CHECK-RV64-NEXT: j .LBB61_321 +; CHECK-RV64-NEXT: .LBB61_831: # %cond.load1245 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 313 +; CHECK-RV64-NEXT: li a3, 312 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bltz a1, .LBB61_832 +; CHECK-RV64-NEXT: j .LBB61_322 +; CHECK-RV64-NEXT: .LBB61_832: # %cond.load1249 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 314 +; CHECK-RV64-NEXT: li a3, 313 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bltz a1, .LBB61_833 +; CHECK-RV64-NEXT: j .LBB61_323 +; CHECK-RV64-NEXT: .LBB61_833: # %cond.load1253 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; 
CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 315 +; CHECK-RV64-NEXT: li a3, 314 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bltz a1, .LBB61_834 +; CHECK-RV64-NEXT: j .LBB61_324 +; CHECK-RV64-NEXT: .LBB61_834: # %cond.load1257 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 316 +; CHECK-RV64-NEXT: li a3, 315 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bltz a1, .LBB61_835 +; CHECK-RV64-NEXT: j .LBB61_325 +; CHECK-RV64-NEXT: .LBB61_835: # %cond.load1261 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 317 +; CHECK-RV64-NEXT: li a3, 316 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_1029 +; CHECK-RV64-NEXT: j .LBB61_326 +; CHECK-RV64-NEXT: .LBB61_1029: # %cond.load1261 +; CHECK-RV64-NEXT: j .LBB61_327 +; CHECK-RV64-NEXT: .LBB61_836: # %cond.load1273 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 320 +; CHECK-RV64-NEXT: li a3, 319 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: bnez a2, .LBB61_837 +; CHECK-RV64-NEXT: j .LBB61_331 +; CHECK-RV64-NEXT: .LBB61_837: # %cond.load1277 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, 
a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 321 +; CHECK-RV64-NEXT: li a3, 320 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: bnez a2, .LBB61_838 +; CHECK-RV64-NEXT: j .LBB61_332 +; CHECK-RV64-NEXT: .LBB61_838: # %cond.load1281 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 322 +; CHECK-RV64-NEXT: li a3, 321 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: bnez a2, .LBB61_839 +; CHECK-RV64-NEXT: j .LBB61_333 +; CHECK-RV64-NEXT: .LBB61_839: # %cond.load1285 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 323 +; CHECK-RV64-NEXT: li a3, 322 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: bnez a2, .LBB61_840 +; CHECK-RV64-NEXT: j .LBB61_334 +; CHECK-RV64-NEXT: .LBB61_840: # %cond.load1289 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 324 +; CHECK-RV64-NEXT: li a3, 323 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: bnez a2, .LBB61_841 +; CHECK-RV64-NEXT: j .LBB61_335 +; CHECK-RV64-NEXT: .LBB61_841: # %cond.load1293 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; 
CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 325 +; CHECK-RV64-NEXT: li a3, 324 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: bnez a2, .LBB61_842 +; CHECK-RV64-NEXT: j .LBB61_336 +; CHECK-RV64-NEXT: .LBB61_842: # %cond.load1297 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 326 +; CHECK-RV64-NEXT: li a3, 325 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: bnez a2, .LBB61_843 +; CHECK-RV64-NEXT: j .LBB61_337 +; CHECK-RV64-NEXT: .LBB61_843: # %cond.load1301 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 327 +; CHECK-RV64-NEXT: li a3, 326 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: bnez a2, .LBB61_844 +; CHECK-RV64-NEXT: j .LBB61_338 +; CHECK-RV64-NEXT: .LBB61_844: # %cond.load1305 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 328 +; CHECK-RV64-NEXT: li a3, 327 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: bnez a2, .LBB61_845 +; CHECK-RV64-NEXT: j .LBB61_339 +; CHECK-RV64-NEXT: .LBB61_845: # %cond.load1309 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; 
CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 329 +; CHECK-RV64-NEXT: li a3, 328 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: bnez a2, .LBB61_846 +; CHECK-RV64-NEXT: j .LBB61_340 +; CHECK-RV64-NEXT: .LBB61_846: # %cond.load1313 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 330 +; CHECK-RV64-NEXT: li a3, 329 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: bnez a2, .LBB61_847 +; CHECK-RV64-NEXT: j .LBB61_341 +; CHECK-RV64-NEXT: .LBB61_847: # %cond.load1317 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 331 +; CHECK-RV64-NEXT: li a3, 330 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bltz a2, .LBB61_848 +; CHECK-RV64-NEXT: j .LBB61_342 +; CHECK-RV64-NEXT: .LBB61_848: # %cond.load1321 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 332 +; CHECK-RV64-NEXT: li a3, 331 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bltz a2, .LBB61_849 +; CHECK-RV64-NEXT: j .LBB61_343 +; CHECK-RV64-NEXT: .LBB61_849: # %cond.load1325 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 333 +; CHECK-RV64-NEXT: li a3, 332 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bltz a2, .LBB61_850 +; CHECK-RV64-NEXT: j .LBB61_344 +; CHECK-RV64-NEXT: .LBB61_850: # %cond.load1329 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 334 +; CHECK-RV64-NEXT: li a3, 333 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bltz a2, .LBB61_851 +; CHECK-RV64-NEXT: j .LBB61_345 +; CHECK-RV64-NEXT: .LBB61_851: # %cond.load1333 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 335 +; CHECK-RV64-NEXT: li a3, 334 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bltz a2, .LBB61_852 +; CHECK-RV64-NEXT: j .LBB61_346 +; CHECK-RV64-NEXT: .LBB61_852: # %cond.load1337 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 336 +; CHECK-RV64-NEXT: li a3, 335 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bltz a2, .LBB61_853 +; CHECK-RV64-NEXT: j .LBB61_347 +; CHECK-RV64-NEXT: 
.LBB61_853: # %cond.load1341 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 337 +; CHECK-RV64-NEXT: li a3, 336 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bltz a2, .LBB61_854 +; CHECK-RV64-NEXT: j .LBB61_348 +; CHECK-RV64-NEXT: .LBB61_854: # %cond.load1345 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 338 +; CHECK-RV64-NEXT: li a3, 337 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bltz a2, .LBB61_855 +; CHECK-RV64-NEXT: j .LBB61_349 +; CHECK-RV64-NEXT: .LBB61_855: # %cond.load1349 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 339 +; CHECK-RV64-NEXT: li a3, 338 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bltz a2, .LBB61_856 +; CHECK-RV64-NEXT: j .LBB61_350 +; CHECK-RV64-NEXT: .LBB61_856: # %cond.load1353 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 340 +; CHECK-RV64-NEXT: li a3, 339 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bltz a2, .LBB61_857 +; CHECK-RV64-NEXT: j 
.LBB61_351 +; CHECK-RV64-NEXT: .LBB61_857: # %cond.load1357 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 341 +; CHECK-RV64-NEXT: li a3, 340 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bltz a2, .LBB61_858 +; CHECK-RV64-NEXT: j .LBB61_352 +; CHECK-RV64-NEXT: .LBB61_858: # %cond.load1361 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 342 +; CHECK-RV64-NEXT: li a3, 341 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bltz a2, .LBB61_859 +; CHECK-RV64-NEXT: j .LBB61_353 +; CHECK-RV64-NEXT: .LBB61_859: # %cond.load1365 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 343 +; CHECK-RV64-NEXT: li a3, 342 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bltz a2, .LBB61_860 +; CHECK-RV64-NEXT: j .LBB61_354 +; CHECK-RV64-NEXT: .LBB61_860: # %cond.load1369 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 344 +; CHECK-RV64-NEXT: li a3, 343 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bltz a2, 
.LBB61_861 +; CHECK-RV64-NEXT: j .LBB61_355 +; CHECK-RV64-NEXT: .LBB61_861: # %cond.load1373 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 345 +; CHECK-RV64-NEXT: li a3, 344 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bltz a2, .LBB61_862 +; CHECK-RV64-NEXT: j .LBB61_356 +; CHECK-RV64-NEXT: .LBB61_862: # %cond.load1377 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 346 +; CHECK-RV64-NEXT: li a3, 345 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bltz a2, .LBB61_863 +; CHECK-RV64-NEXT: j .LBB61_357 +; CHECK-RV64-NEXT: .LBB61_863: # %cond.load1381 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 347 +; CHECK-RV64-NEXT: li a3, 346 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bltz a2, .LBB61_864 +; CHECK-RV64-NEXT: j .LBB61_358 +; CHECK-RV64-NEXT: .LBB61_864: # %cond.load1385 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 348 +; CHECK-RV64-NEXT: li a3, 347 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; 
CHECK-RV64-NEXT: bltz a2, .LBB61_865 +; CHECK-RV64-NEXT: j .LBB61_359 +; CHECK-RV64-NEXT: .LBB61_865: # %cond.load1389 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 349 +; CHECK-RV64-NEXT: li a3, 348 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bltz a2, .LBB61_866 +; CHECK-RV64-NEXT: j .LBB61_360 +; CHECK-RV64-NEXT: .LBB61_866: # %cond.load1393 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 350 +; CHECK-RV64-NEXT: li a3, 349 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bltz a2, .LBB61_867 +; CHECK-RV64-NEXT: j .LBB61_361 +; CHECK-RV64-NEXT: .LBB61_867: # %cond.load1397 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 351 +; CHECK-RV64-NEXT: li a3, 350 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bltz a2, .LBB61_868 +; CHECK-RV64-NEXT: j .LBB61_362 +; CHECK-RV64-NEXT: .LBB61_868: # %cond.load1401 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 352 +; CHECK-RV64-NEXT: li a3, 351 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; 
CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bltz a2, .LBB61_869 +; CHECK-RV64-NEXT: j .LBB61_363 +; CHECK-RV64-NEXT: .LBB61_869: # %cond.load1405 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 353 +; CHECK-RV64-NEXT: li a3, 352 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bltz a2, .LBB61_870 +; CHECK-RV64-NEXT: j .LBB61_364 +; CHECK-RV64-NEXT: .LBB61_870: # %cond.load1409 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 354 +; CHECK-RV64-NEXT: li a3, 353 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bltz a2, .LBB61_871 +; CHECK-RV64-NEXT: j .LBB61_365 +; CHECK-RV64-NEXT: .LBB61_871: # %cond.load1413 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 355 +; CHECK-RV64-NEXT: li a3, 354 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bltz a2, .LBB61_872 +; CHECK-RV64-NEXT: j .LBB61_366 +; CHECK-RV64-NEXT: .LBB61_872: # %cond.load1417 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 356 +; CHECK-RV64-NEXT: li a3, 355 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bltz a2, .LBB61_873 +; CHECK-RV64-NEXT: j .LBB61_367 +; CHECK-RV64-NEXT: .LBB61_873: # %cond.load1421 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 357 +; CHECK-RV64-NEXT: li a3, 356 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bltz a2, .LBB61_874 +; CHECK-RV64-NEXT: j .LBB61_368 +; CHECK-RV64-NEXT: .LBB61_874: # %cond.load1425 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 358 +; CHECK-RV64-NEXT: li a3, 357 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; CHECK-RV64-NEXT: bltz a2, .LBB61_875 +; CHECK-RV64-NEXT: j .LBB61_369 +; CHECK-RV64-NEXT: .LBB61_875: # %cond.load1429 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 359 +; CHECK-RV64-NEXT: li a3, 358 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bltz a2, .LBB61_876 +; CHECK-RV64-NEXT: j .LBB61_370 +; CHECK-RV64-NEXT: .LBB61_876: # %cond.load1433 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 360 +; CHECK-RV64-NEXT: li a3, 359 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: 
vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bltz a2, .LBB61_877 +; CHECK-RV64-NEXT: j .LBB61_371 +; CHECK-RV64-NEXT: .LBB61_877: # %cond.load1437 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 361 +; CHECK-RV64-NEXT: li a3, 360 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bltz a2, .LBB61_878 +; CHECK-RV64-NEXT: j .LBB61_372 +; CHECK-RV64-NEXT: .LBB61_878: # %cond.load1441 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 362 +; CHECK-RV64-NEXT: li a3, 361 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bltz a2, .LBB61_879 +; CHECK-RV64-NEXT: j .LBB61_373 +; CHECK-RV64-NEXT: .LBB61_879: # %cond.load1445 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 363 +; CHECK-RV64-NEXT: li a3, 362 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bltz a2, .LBB61_880 +; CHECK-RV64-NEXT: j .LBB61_374 +; CHECK-RV64-NEXT: .LBB61_880: # %cond.load1449 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 364 +; CHECK-RV64-NEXT: li a3, 363 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, 
tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bltz a2, .LBB61_881 +; CHECK-RV64-NEXT: j .LBB61_375 +; CHECK-RV64-NEXT: .LBB61_881: # %cond.load1453 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 365 +; CHECK-RV64-NEXT: li a3, 364 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bltz a2, .LBB61_882 +; CHECK-RV64-NEXT: j .LBB61_376 +; CHECK-RV64-NEXT: .LBB61_882: # %cond.load1457 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 366 +; CHECK-RV64-NEXT: li a3, 365 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bltz a2, .LBB61_883 +; CHECK-RV64-NEXT: j .LBB61_377 +; CHECK-RV64-NEXT: .LBB61_883: # %cond.load1461 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 367 +; CHECK-RV64-NEXT: li a3, 366 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bltz a2, .LBB61_884 +; CHECK-RV64-NEXT: j .LBB61_378 +; CHECK-RV64-NEXT: .LBB61_884: # %cond.load1465 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 368 +; CHECK-RV64-NEXT: li a3, 367 +; CHECK-RV64-NEXT: 
vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bltz a2, .LBB61_885 +; CHECK-RV64-NEXT: j .LBB61_379 +; CHECK-RV64-NEXT: .LBB61_885: # %cond.load1469 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 369 +; CHECK-RV64-NEXT: li a3, 368 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bltz a2, .LBB61_886 +; CHECK-RV64-NEXT: j .LBB61_380 +; CHECK-RV64-NEXT: .LBB61_886: # %cond.load1473 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 370 +; CHECK-RV64-NEXT: li a3, 369 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bltz a2, .LBB61_887 +; CHECK-RV64-NEXT: j .LBB61_381 +; CHECK-RV64-NEXT: .LBB61_887: # %cond.load1477 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 371 +; CHECK-RV64-NEXT: li a3, 370 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bltz a2, .LBB61_888 +; CHECK-RV64-NEXT: j .LBB61_382 +; CHECK-RV64-NEXT: .LBB61_888: # %cond.load1481 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 372 +; CHECK-RV64-NEXT: li a3, 
371 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bltz a2, .LBB61_889 +; CHECK-RV64-NEXT: j .LBB61_383 +; CHECK-RV64-NEXT: .LBB61_889: # %cond.load1485 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 373 +; CHECK-RV64-NEXT: li a3, 372 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bltz a2, .LBB61_890 +; CHECK-RV64-NEXT: j .LBB61_384 +; CHECK-RV64-NEXT: .LBB61_890: # %cond.load1489 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 374 +; CHECK-RV64-NEXT: li a3, 373 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bltz a2, .LBB61_891 +; CHECK-RV64-NEXT: j .LBB61_385 +; CHECK-RV64-NEXT: .LBB61_891: # %cond.load1493 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 375 +; CHECK-RV64-NEXT: li a3, 374 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bltz a2, .LBB61_892 +; CHECK-RV64-NEXT: j .LBB61_386 +; CHECK-RV64-NEXT: .LBB61_892: # %cond.load1497 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 376 +; 
CHECK-RV64-NEXT: li a3, 375 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bltz a2, .LBB61_893 +; CHECK-RV64-NEXT: j .LBB61_387 +; CHECK-RV64-NEXT: .LBB61_893: # %cond.load1501 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 377 +; CHECK-RV64-NEXT: li a3, 376 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bltz a2, .LBB61_894 +; CHECK-RV64-NEXT: j .LBB61_388 +; CHECK-RV64-NEXT: .LBB61_894: # %cond.load1505 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 378 +; CHECK-RV64-NEXT: li a3, 377 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bltz a2, .LBB61_895 +; CHECK-RV64-NEXT: j .LBB61_389 +; CHECK-RV64-NEXT: .LBB61_895: # %cond.load1509 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 379 +; CHECK-RV64-NEXT: li a3, 378 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bltz a2, .LBB61_896 +; CHECK-RV64-NEXT: j .LBB61_390 +; CHECK-RV64-NEXT: .LBB61_896: # %cond.load1513 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; 
CHECK-RV64-NEXT: li a2, 380 +; CHECK-RV64-NEXT: li a3, 379 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bltz a2, .LBB61_897 +; CHECK-RV64-NEXT: j .LBB61_391 +; CHECK-RV64-NEXT: .LBB61_897: # %cond.load1517 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 381 +; CHECK-RV64-NEXT: li a3, 380 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bgez a2, .LBB61_1030 +; CHECK-RV64-NEXT: j .LBB61_392 +; CHECK-RV64-NEXT: .LBB61_1030: # %cond.load1517 +; CHECK-RV64-NEXT: j .LBB61_393 +; CHECK-RV64-NEXT: .LBB61_898: # %cond.load1529 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 384 +; CHECK-RV64-NEXT: li a3, 383 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 1 +; CHECK-RV64-NEXT: bnez a1, .LBB61_899 +; CHECK-RV64-NEXT: j .LBB61_397 +; CHECK-RV64-NEXT: .LBB61_899: # %cond.load1533 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 385 +; CHECK-RV64-NEXT: li a3, 384 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 2 +; CHECK-RV64-NEXT: bnez a1, .LBB61_900 +; CHECK-RV64-NEXT: j .LBB61_398 +; CHECK-RV64-NEXT: .LBB61_900: # %cond.load1537 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; 
CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 386 +; CHECK-RV64-NEXT: li a3, 385 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 4 +; CHECK-RV64-NEXT: bnez a1, .LBB61_901 +; CHECK-RV64-NEXT: j .LBB61_399 +; CHECK-RV64-NEXT: .LBB61_901: # %cond.load1541 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 387 +; CHECK-RV64-NEXT: li a3, 386 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 8 +; CHECK-RV64-NEXT: bnez a1, .LBB61_902 +; CHECK-RV64-NEXT: j .LBB61_400 +; CHECK-RV64-NEXT: .LBB61_902: # %cond.load1545 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 388 +; CHECK-RV64-NEXT: li a3, 387 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 16 +; CHECK-RV64-NEXT: bnez a1, .LBB61_903 +; CHECK-RV64-NEXT: j .LBB61_401 +; CHECK-RV64-NEXT: .LBB61_903: # %cond.load1549 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 389 +; CHECK-RV64-NEXT: li a3, 388 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 32 +; CHECK-RV64-NEXT: bnez a1, .LBB61_904 +; CHECK-RV64-NEXT: j .LBB61_402 +; CHECK-RV64-NEXT: .LBB61_904: # %cond.load1553 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, 
a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 390 +; CHECK-RV64-NEXT: li a3, 389 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 64 +; CHECK-RV64-NEXT: bnez a1, .LBB61_905 +; CHECK-RV64-NEXT: j .LBB61_403 +; CHECK-RV64-NEXT: .LBB61_905: # %cond.load1557 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 391 +; CHECK-RV64-NEXT: li a3, 390 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 128 +; CHECK-RV64-NEXT: bnez a1, .LBB61_906 +; CHECK-RV64-NEXT: j .LBB61_404 +; CHECK-RV64-NEXT: .LBB61_906: # %cond.load1561 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 392 +; CHECK-RV64-NEXT: li a3, 391 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 256 +; CHECK-RV64-NEXT: bnez a1, .LBB61_907 +; CHECK-RV64-NEXT: j .LBB61_405 +; CHECK-RV64-NEXT: .LBB61_907: # %cond.load1565 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 393 +; CHECK-RV64-NEXT: li a3, 392 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 512 +; CHECK-RV64-NEXT: bnez a1, .LBB61_908 +; CHECK-RV64-NEXT: j .LBB61_406 +; CHECK-RV64-NEXT: .LBB61_908: # %cond.load1569 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; 
CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 394 +; CHECK-RV64-NEXT: li a3, 393 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a1, a2, 1024 +; CHECK-RV64-NEXT: bnez a1, .LBB61_909 +; CHECK-RV64-NEXT: j .LBB61_407 +; CHECK-RV64-NEXT: .LBB61_909: # %cond.load1573 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 395 +; CHECK-RV64-NEXT: li a3, 394 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 52 +; CHECK-RV64-NEXT: bltz a1, .LBB61_910 +; CHECK-RV64-NEXT: j .LBB61_408 +; CHECK-RV64-NEXT: .LBB61_910: # %cond.load1577 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 396 +; CHECK-RV64-NEXT: li a3, 395 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 51 +; CHECK-RV64-NEXT: bltz a1, .LBB61_911 +; CHECK-RV64-NEXT: j .LBB61_409 +; CHECK-RV64-NEXT: .LBB61_911: # %cond.load1581 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 397 +; CHECK-RV64-NEXT: li a3, 396 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 50 +; CHECK-RV64-NEXT: bltz a1, .LBB61_912 +; CHECK-RV64-NEXT: j .LBB61_410 +; CHECK-RV64-NEXT: .LBB61_912: # %cond.load1585 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; 
CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 398 +; CHECK-RV64-NEXT: li a3, 397 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 49 +; CHECK-RV64-NEXT: bltz a1, .LBB61_913 +; CHECK-RV64-NEXT: j .LBB61_411 +; CHECK-RV64-NEXT: .LBB61_913: # %cond.load1589 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 399 +; CHECK-RV64-NEXT: li a3, 398 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 48 +; CHECK-RV64-NEXT: bltz a1, .LBB61_914 +; CHECK-RV64-NEXT: j .LBB61_412 +; CHECK-RV64-NEXT: .LBB61_914: # %cond.load1593 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 400 +; CHECK-RV64-NEXT: li a3, 399 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 47 +; CHECK-RV64-NEXT: bltz a1, .LBB61_915 +; CHECK-RV64-NEXT: j .LBB61_413 +; CHECK-RV64-NEXT: .LBB61_915: # %cond.load1597 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 401 +; CHECK-RV64-NEXT: li a3, 400 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 46 +; CHECK-RV64-NEXT: bltz a1, .LBB61_916 +; CHECK-RV64-NEXT: j .LBB61_414 +; CHECK-RV64-NEXT: .LBB61_916: # %cond.load1601 +; 
CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 402 +; CHECK-RV64-NEXT: li a3, 401 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 45 +; CHECK-RV64-NEXT: bltz a1, .LBB61_917 +; CHECK-RV64-NEXT: j .LBB61_415 +; CHECK-RV64-NEXT: .LBB61_917: # %cond.load1605 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 403 +; CHECK-RV64-NEXT: li a3, 402 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 44 +; CHECK-RV64-NEXT: bltz a1, .LBB61_918 +; CHECK-RV64-NEXT: j .LBB61_416 +; CHECK-RV64-NEXT: .LBB61_918: # %cond.load1609 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 404 +; CHECK-RV64-NEXT: li a3, 403 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 43 +; CHECK-RV64-NEXT: bltz a1, .LBB61_919 +; CHECK-RV64-NEXT: j .LBB61_417 +; CHECK-RV64-NEXT: .LBB61_919: # %cond.load1613 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 405 +; CHECK-RV64-NEXT: li a3, 404 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 42 +; CHECK-RV64-NEXT: bltz a1, .LBB61_920 +; CHECK-RV64-NEXT: j .LBB61_418 +; CHECK-RV64-NEXT: 
.LBB61_920: # %cond.load1617 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 406 +; CHECK-RV64-NEXT: li a3, 405 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 41 +; CHECK-RV64-NEXT: bltz a1, .LBB61_921 +; CHECK-RV64-NEXT: j .LBB61_419 +; CHECK-RV64-NEXT: .LBB61_921: # %cond.load1621 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 407 +; CHECK-RV64-NEXT: li a3, 406 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 40 +; CHECK-RV64-NEXT: bltz a1, .LBB61_922 +; CHECK-RV64-NEXT: j .LBB61_420 +; CHECK-RV64-NEXT: .LBB61_922: # %cond.load1625 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 408 +; CHECK-RV64-NEXT: li a3, 407 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 39 +; CHECK-RV64-NEXT: bltz a1, .LBB61_923 +; CHECK-RV64-NEXT: j .LBB61_421 +; CHECK-RV64-NEXT: .LBB61_923: # %cond.load1629 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 409 +; CHECK-RV64-NEXT: li a3, 408 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 38 +; CHECK-RV64-NEXT: bltz a1, .LBB61_924 +; CHECK-RV64-NEXT: j 
.LBB61_422 +; CHECK-RV64-NEXT: .LBB61_924: # %cond.load1633 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 410 +; CHECK-RV64-NEXT: li a3, 409 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 37 +; CHECK-RV64-NEXT: bltz a1, .LBB61_925 +; CHECK-RV64-NEXT: j .LBB61_423 +; CHECK-RV64-NEXT: .LBB61_925: # %cond.load1637 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 411 +; CHECK-RV64-NEXT: li a3, 410 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 36 +; CHECK-RV64-NEXT: bltz a1, .LBB61_926 +; CHECK-RV64-NEXT: j .LBB61_424 +; CHECK-RV64-NEXT: .LBB61_926: # %cond.load1641 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 412 +; CHECK-RV64-NEXT: li a3, 411 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 35 +; CHECK-RV64-NEXT: bltz a1, .LBB61_927 +; CHECK-RV64-NEXT: j .LBB61_425 +; CHECK-RV64-NEXT: .LBB61_927: # %cond.load1645 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 413 +; CHECK-RV64-NEXT: li a3, 412 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 34 +; CHECK-RV64-NEXT: bltz a1, 
.LBB61_928 +; CHECK-RV64-NEXT: j .LBB61_426 +; CHECK-RV64-NEXT: .LBB61_928: # %cond.load1649 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 414 +; CHECK-RV64-NEXT: li a3, 413 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 33 +; CHECK-RV64-NEXT: bltz a1, .LBB61_929 +; CHECK-RV64-NEXT: j .LBB61_427 +; CHECK-RV64-NEXT: .LBB61_929: # %cond.load1653 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 415 +; CHECK-RV64-NEXT: li a3, 414 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 32 +; CHECK-RV64-NEXT: bltz a1, .LBB61_930 +; CHECK-RV64-NEXT: j .LBB61_428 +; CHECK-RV64-NEXT: .LBB61_930: # %cond.load1657 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 416 +; CHECK-RV64-NEXT: li a3, 415 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 31 +; CHECK-RV64-NEXT: bltz a1, .LBB61_931 +; CHECK-RV64-NEXT: j .LBB61_429 +; CHECK-RV64-NEXT: .LBB61_931: # %cond.load1661 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 417 +; CHECK-RV64-NEXT: li a3, 416 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 30 +; 
CHECK-RV64-NEXT: bltz a1, .LBB61_932 +; CHECK-RV64-NEXT: j .LBB61_430 +; CHECK-RV64-NEXT: .LBB61_932: # %cond.load1665 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 418 +; CHECK-RV64-NEXT: li a3, 417 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 29 +; CHECK-RV64-NEXT: bltz a1, .LBB61_933 +; CHECK-RV64-NEXT: j .LBB61_431 +; CHECK-RV64-NEXT: .LBB61_933: # %cond.load1669 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 419 +; CHECK-RV64-NEXT: li a3, 418 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 28 +; CHECK-RV64-NEXT: bltz a1, .LBB61_934 +; CHECK-RV64-NEXT: j .LBB61_432 +; CHECK-RV64-NEXT: .LBB61_934: # %cond.load1673 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 420 +; CHECK-RV64-NEXT: li a3, 419 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 27 +; CHECK-RV64-NEXT: bltz a1, .LBB61_935 +; CHECK-RV64-NEXT: j .LBB61_433 +; CHECK-RV64-NEXT: .LBB61_935: # %cond.load1677 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 421 +; CHECK-RV64-NEXT: li a3, 420 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; 
CHECK-RV64-NEXT: slli a1, a2, 26 +; CHECK-RV64-NEXT: bltz a1, .LBB61_936 +; CHECK-RV64-NEXT: j .LBB61_434 +; CHECK-RV64-NEXT: .LBB61_936: # %cond.load1681 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 422 +; CHECK-RV64-NEXT: li a3, 421 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 25 +; CHECK-RV64-NEXT: bltz a1, .LBB61_937 +; CHECK-RV64-NEXT: j .LBB61_435 +; CHECK-RV64-NEXT: .LBB61_937: # %cond.load1685 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 423 +; CHECK-RV64-NEXT: li a3, 422 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 24 +; CHECK-RV64-NEXT: bltz a1, .LBB61_938 +; CHECK-RV64-NEXT: j .LBB61_436 +; CHECK-RV64-NEXT: .LBB61_938: # %cond.load1689 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 424 +; CHECK-RV64-NEXT: li a3, 423 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 23 +; CHECK-RV64-NEXT: bltz a1, .LBB61_939 +; CHECK-RV64-NEXT: j .LBB61_437 +; CHECK-RV64-NEXT: .LBB61_939: # %cond.load1693 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 425 +; CHECK-RV64-NEXT: li a3, 424 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 22 +; CHECK-RV64-NEXT: bltz a1, .LBB61_940 +; CHECK-RV64-NEXT: j .LBB61_438 +; CHECK-RV64-NEXT: .LBB61_940: # %cond.load1697 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 426 +; CHECK-RV64-NEXT: li a3, 425 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 21 +; CHECK-RV64-NEXT: bltz a1, .LBB61_941 +; CHECK-RV64-NEXT: j .LBB61_439 +; CHECK-RV64-NEXT: .LBB61_941: # %cond.load1701 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 427 +; CHECK-RV64-NEXT: li a3, 426 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 20 +; CHECK-RV64-NEXT: bltz a1, .LBB61_942 +; CHECK-RV64-NEXT: j .LBB61_440 +; CHECK-RV64-NEXT: .LBB61_942: # %cond.load1705 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 428 +; CHECK-RV64-NEXT: li a3, 427 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 19 +; CHECK-RV64-NEXT: bltz a1, .LBB61_943 +; CHECK-RV64-NEXT: j .LBB61_441 +; CHECK-RV64-NEXT: .LBB61_943: # %cond.load1709 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 429 +; CHECK-RV64-NEXT: li a3, 428 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: 
vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 18 +; CHECK-RV64-NEXT: bltz a1, .LBB61_944 +; CHECK-RV64-NEXT: j .LBB61_442 +; CHECK-RV64-NEXT: .LBB61_944: # %cond.load1713 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 430 +; CHECK-RV64-NEXT: li a3, 429 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 17 +; CHECK-RV64-NEXT: bltz a1, .LBB61_945 +; CHECK-RV64-NEXT: j .LBB61_443 +; CHECK-RV64-NEXT: .LBB61_945: # %cond.load1717 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 431 +; CHECK-RV64-NEXT: li a3, 430 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 16 +; CHECK-RV64-NEXT: bltz a1, .LBB61_946 +; CHECK-RV64-NEXT: j .LBB61_444 +; CHECK-RV64-NEXT: .LBB61_946: # %cond.load1721 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 432 +; CHECK-RV64-NEXT: li a3, 431 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 15 +; CHECK-RV64-NEXT: bltz a1, .LBB61_947 +; CHECK-RV64-NEXT: j .LBB61_445 +; CHECK-RV64-NEXT: .LBB61_947: # %cond.load1725 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 433 +; CHECK-RV64-NEXT: li a3, 432 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, 
tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 14 +; CHECK-RV64-NEXT: bltz a1, .LBB61_948 +; CHECK-RV64-NEXT: j .LBB61_446 +; CHECK-RV64-NEXT: .LBB61_948: # %cond.load1729 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 434 +; CHECK-RV64-NEXT: li a3, 433 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 13 +; CHECK-RV64-NEXT: bltz a1, .LBB61_949 +; CHECK-RV64-NEXT: j .LBB61_447 +; CHECK-RV64-NEXT: .LBB61_949: # %cond.load1733 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 435 +; CHECK-RV64-NEXT: li a3, 434 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 12 +; CHECK-RV64-NEXT: bltz a1, .LBB61_950 +; CHECK-RV64-NEXT: j .LBB61_448 +; CHECK-RV64-NEXT: .LBB61_950: # %cond.load1737 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 436 +; CHECK-RV64-NEXT: li a3, 435 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 11 +; CHECK-RV64-NEXT: bltz a1, .LBB61_951 +; CHECK-RV64-NEXT: j .LBB61_449 +; CHECK-RV64-NEXT: .LBB61_951: # %cond.load1741 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 437 +; CHECK-RV64-NEXT: li a3, 436 +; CHECK-RV64-NEXT: 
vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 10 +; CHECK-RV64-NEXT: bltz a1, .LBB61_952 +; CHECK-RV64-NEXT: j .LBB61_450 +; CHECK-RV64-NEXT: .LBB61_952: # %cond.load1745 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 438 +; CHECK-RV64-NEXT: li a3, 437 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 9 +; CHECK-RV64-NEXT: bltz a1, .LBB61_953 +; CHECK-RV64-NEXT: j .LBB61_451 +; CHECK-RV64-NEXT: .LBB61_953: # %cond.load1749 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 439 +; CHECK-RV64-NEXT: li a3, 438 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 8 +; CHECK-RV64-NEXT: bltz a1, .LBB61_954 +; CHECK-RV64-NEXT: j .LBB61_452 +; CHECK-RV64-NEXT: .LBB61_954: # %cond.load1753 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 440 +; CHECK-RV64-NEXT: li a3, 439 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 7 +; CHECK-RV64-NEXT: bltz a1, .LBB61_955 +; CHECK-RV64-NEXT: j .LBB61_453 +; CHECK-RV64-NEXT: .LBB61_955: # %cond.load1757 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 441 +; CHECK-RV64-NEXT: li a3, 
440 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 6 +; CHECK-RV64-NEXT: bltz a1, .LBB61_956 +; CHECK-RV64-NEXT: j .LBB61_454 +; CHECK-RV64-NEXT: .LBB61_956: # %cond.load1761 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 442 +; CHECK-RV64-NEXT: li a3, 441 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 5 +; CHECK-RV64-NEXT: bltz a1, .LBB61_957 +; CHECK-RV64-NEXT: j .LBB61_455 +; CHECK-RV64-NEXT: .LBB61_957: # %cond.load1765 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 443 +; CHECK-RV64-NEXT: li a3, 442 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 4 +; CHECK-RV64-NEXT: bltz a1, .LBB61_958 +; CHECK-RV64-NEXT: j .LBB61_456 +; CHECK-RV64-NEXT: .LBB61_958: # %cond.load1769 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 444 +; CHECK-RV64-NEXT: li a3, 443 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 3 +; CHECK-RV64-NEXT: bltz a1, .LBB61_959 +; CHECK-RV64-NEXT: j .LBB61_457 +; CHECK-RV64-NEXT: .LBB61_959: # %cond.load1773 +; CHECK-RV64-NEXT: lbu a1, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a1 +; CHECK-RV64-NEXT: li a1, 445 +; 
CHECK-RV64-NEXT: li a3, 444 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a1, a2, 2 +; CHECK-RV64-NEXT: bgez a1, .LBB61_1031 +; CHECK-RV64-NEXT: j .LBB61_458 +; CHECK-RV64-NEXT: .LBB61_1031: # %cond.load1773 +; CHECK-RV64-NEXT: j .LBB61_459 +; CHECK-RV64-NEXT: .LBB61_960: # %cond.load1785 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 448 +; CHECK-RV64-NEXT: li a3, 447 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 1 +; CHECK-RV64-NEXT: bnez a2, .LBB61_961 +; CHECK-RV64-NEXT: j .LBB61_463 +; CHECK-RV64-NEXT: .LBB61_961: # %cond.load1789 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 449 +; CHECK-RV64-NEXT: li a3, 448 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 2 +; CHECK-RV64-NEXT: bnez a2, .LBB61_962 +; CHECK-RV64-NEXT: j .LBB61_464 +; CHECK-RV64-NEXT: .LBB61_962: # %cond.load1793 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 450 +; CHECK-RV64-NEXT: li a3, 449 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 4 +; CHECK-RV64-NEXT: bnez a2, .LBB61_963 +; CHECK-RV64-NEXT: j .LBB61_465 +; CHECK-RV64-NEXT: .LBB61_963: # %cond.load1797 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; 
CHECK-RV64-NEXT: li a2, 451 +; CHECK-RV64-NEXT: li a3, 450 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 8 +; CHECK-RV64-NEXT: bnez a2, .LBB61_964 +; CHECK-RV64-NEXT: j .LBB61_466 +; CHECK-RV64-NEXT: .LBB61_964: # %cond.load1801 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 452 +; CHECK-RV64-NEXT: li a3, 451 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 16 +; CHECK-RV64-NEXT: bnez a2, .LBB61_965 +; CHECK-RV64-NEXT: j .LBB61_467 +; CHECK-RV64-NEXT: .LBB61_965: # %cond.load1805 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 453 +; CHECK-RV64-NEXT: li a3, 452 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 32 +; CHECK-RV64-NEXT: bnez a2, .LBB61_966 +; CHECK-RV64-NEXT: j .LBB61_468 +; CHECK-RV64-NEXT: .LBB61_966: # %cond.load1809 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 454 +; CHECK-RV64-NEXT: li a3, 453 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 64 +; CHECK-RV64-NEXT: bnez a2, .LBB61_967 +; CHECK-RV64-NEXT: j .LBB61_469 +; CHECK-RV64-NEXT: .LBB61_967: # %cond.load1813 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; 
CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 455 +; CHECK-RV64-NEXT: li a3, 454 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 128 +; CHECK-RV64-NEXT: bnez a2, .LBB61_968 +; CHECK-RV64-NEXT: j .LBB61_470 +; CHECK-RV64-NEXT: .LBB61_968: # %cond.load1817 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 456 +; CHECK-RV64-NEXT: li a3, 455 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 256 +; CHECK-RV64-NEXT: bnez a2, .LBB61_969 +; CHECK-RV64-NEXT: j .LBB61_471 +; CHECK-RV64-NEXT: .LBB61_969: # %cond.load1821 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 457 +; CHECK-RV64-NEXT: li a3, 456 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 512 +; CHECK-RV64-NEXT: bnez a2, .LBB61_970 +; CHECK-RV64-NEXT: j .LBB61_472 +; CHECK-RV64-NEXT: .LBB61_970: # %cond.load1825 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 458 +; CHECK-RV64-NEXT: li a3, 457 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: andi a2, a1, 1024 +; CHECK-RV64-NEXT: bnez a2, .LBB61_971 +; CHECK-RV64-NEXT: j .LBB61_473 +; CHECK-RV64-NEXT: .LBB61_971: # %cond.load1829 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: 
vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 459 +; CHECK-RV64-NEXT: li a3, 458 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 52 +; CHECK-RV64-NEXT: bltz a2, .LBB61_972 +; CHECK-RV64-NEXT: j .LBB61_474 +; CHECK-RV64-NEXT: .LBB61_972: # %cond.load1833 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 460 +; CHECK-RV64-NEXT: li a3, 459 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 51 +; CHECK-RV64-NEXT: bltz a2, .LBB61_973 +; CHECK-RV64-NEXT: j .LBB61_475 +; CHECK-RV64-NEXT: .LBB61_973: # %cond.load1837 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 461 +; CHECK-RV64-NEXT: li a3, 460 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 50 +; CHECK-RV64-NEXT: bltz a2, .LBB61_974 +; CHECK-RV64-NEXT: j .LBB61_476 +; CHECK-RV64-NEXT: .LBB61_974: # %cond.load1841 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 462 +; CHECK-RV64-NEXT: li a3, 461 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 49 +; CHECK-RV64-NEXT: bltz a2, .LBB61_975 +; CHECK-RV64-NEXT: j .LBB61_477 +; CHECK-RV64-NEXT: .LBB61_975: # %cond.load1845 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 
512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 463 +; CHECK-RV64-NEXT: li a3, 462 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 48 +; CHECK-RV64-NEXT: bltz a2, .LBB61_976 +; CHECK-RV64-NEXT: j .LBB61_478 +; CHECK-RV64-NEXT: .LBB61_976: # %cond.load1849 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 464 +; CHECK-RV64-NEXT: li a3, 463 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 47 +; CHECK-RV64-NEXT: bltz a2, .LBB61_977 +; CHECK-RV64-NEXT: j .LBB61_479 +; CHECK-RV64-NEXT: .LBB61_977: # %cond.load1853 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 465 +; CHECK-RV64-NEXT: li a3, 464 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 46 +; CHECK-RV64-NEXT: bltz a2, .LBB61_978 +; CHECK-RV64-NEXT: j .LBB61_480 +; CHECK-RV64-NEXT: .LBB61_978: # %cond.load1857 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 466 +; CHECK-RV64-NEXT: li a3, 465 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 45 +; CHECK-RV64-NEXT: bltz a2, .LBB61_979 +; CHECK-RV64-NEXT: j .LBB61_481 +; CHECK-RV64-NEXT: .LBB61_979: # %cond.load1861 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; 
CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 467 +; CHECK-RV64-NEXT: li a3, 466 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 44 +; CHECK-RV64-NEXT: bltz a2, .LBB61_980 +; CHECK-RV64-NEXT: j .LBB61_482 +; CHECK-RV64-NEXT: .LBB61_980: # %cond.load1865 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 468 +; CHECK-RV64-NEXT: li a3, 467 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 43 +; CHECK-RV64-NEXT: bltz a2, .LBB61_981 +; CHECK-RV64-NEXT: j .LBB61_483 +; CHECK-RV64-NEXT: .LBB61_981: # %cond.load1869 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 469 +; CHECK-RV64-NEXT: li a3, 468 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 42 +; CHECK-RV64-NEXT: bltz a2, .LBB61_982 +; CHECK-RV64-NEXT: j .LBB61_484 +; CHECK-RV64-NEXT: .LBB61_982: # %cond.load1873 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 470 +; CHECK-RV64-NEXT: li a3, 469 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 41 +; CHECK-RV64-NEXT: bltz a2, .LBB61_983 +; CHECK-RV64-NEXT: j .LBB61_485 +; CHECK-RV64-NEXT: .LBB61_983: # %cond.load1877 +; 
CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 471 +; CHECK-RV64-NEXT: li a3, 470 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 40 +; CHECK-RV64-NEXT: bltz a2, .LBB61_984 +; CHECK-RV64-NEXT: j .LBB61_486 +; CHECK-RV64-NEXT: .LBB61_984: # %cond.load1881 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 472 +; CHECK-RV64-NEXT: li a3, 471 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 39 +; CHECK-RV64-NEXT: bltz a2, .LBB61_985 +; CHECK-RV64-NEXT: j .LBB61_487 +; CHECK-RV64-NEXT: .LBB61_985: # %cond.load1885 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 473 +; CHECK-RV64-NEXT: li a3, 472 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 38 +; CHECK-RV64-NEXT: bltz a2, .LBB61_986 +; CHECK-RV64-NEXT: j .LBB61_488 +; CHECK-RV64-NEXT: .LBB61_986: # %cond.load1889 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 474 +; CHECK-RV64-NEXT: li a3, 473 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 37 +; CHECK-RV64-NEXT: bltz a2, .LBB61_987 +; CHECK-RV64-NEXT: j .LBB61_489 +; CHECK-RV64-NEXT: 
.LBB61_987: # %cond.load1893 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 475 +; CHECK-RV64-NEXT: li a3, 474 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 36 +; CHECK-RV64-NEXT: bltz a2, .LBB61_988 +; CHECK-RV64-NEXT: j .LBB61_490 +; CHECK-RV64-NEXT: .LBB61_988: # %cond.load1897 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 476 +; CHECK-RV64-NEXT: li a3, 475 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 35 +; CHECK-RV64-NEXT: bltz a2, .LBB61_989 +; CHECK-RV64-NEXT: j .LBB61_491 +; CHECK-RV64-NEXT: .LBB61_989: # %cond.load1901 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 477 +; CHECK-RV64-NEXT: li a3, 476 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 34 +; CHECK-RV64-NEXT: bltz a2, .LBB61_990 +; CHECK-RV64-NEXT: j .LBB61_492 +; CHECK-RV64-NEXT: .LBB61_990: # %cond.load1905 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 478 +; CHECK-RV64-NEXT: li a3, 477 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 33 +; CHECK-RV64-NEXT: bltz a2, .LBB61_991 +; CHECK-RV64-NEXT: j 
.LBB61_493 +; CHECK-RV64-NEXT: .LBB61_991: # %cond.load1909 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 479 +; CHECK-RV64-NEXT: li a3, 478 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 32 +; CHECK-RV64-NEXT: bltz a2, .LBB61_992 +; CHECK-RV64-NEXT: j .LBB61_494 +; CHECK-RV64-NEXT: .LBB61_992: # %cond.load1913 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 480 +; CHECK-RV64-NEXT: li a3, 479 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 31 +; CHECK-RV64-NEXT: bltz a2, .LBB61_993 +; CHECK-RV64-NEXT: j .LBB61_495 +; CHECK-RV64-NEXT: .LBB61_993: # %cond.load1917 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 481 +; CHECK-RV64-NEXT: li a3, 480 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 30 +; CHECK-RV64-NEXT: bltz a2, .LBB61_994 +; CHECK-RV64-NEXT: j .LBB61_496 +; CHECK-RV64-NEXT: .LBB61_994: # %cond.load1921 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 482 +; CHECK-RV64-NEXT: li a3, 481 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 29 +; CHECK-RV64-NEXT: bltz a2, 
.LBB61_995 +; CHECK-RV64-NEXT: j .LBB61_497 +; CHECK-RV64-NEXT: .LBB61_995: # %cond.load1925 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 483 +; CHECK-RV64-NEXT: li a3, 482 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 28 +; CHECK-RV64-NEXT: bltz a2, .LBB61_996 +; CHECK-RV64-NEXT: j .LBB61_498 +; CHECK-RV64-NEXT: .LBB61_996: # %cond.load1929 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 484 +; CHECK-RV64-NEXT: li a3, 483 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 27 +; CHECK-RV64-NEXT: bltz a2, .LBB61_997 +; CHECK-RV64-NEXT: j .LBB61_499 +; CHECK-RV64-NEXT: .LBB61_997: # %cond.load1933 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 485 +; CHECK-RV64-NEXT: li a3, 484 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 26 +; CHECK-RV64-NEXT: bltz a2, .LBB61_998 +; CHECK-RV64-NEXT: j .LBB61_500 +; CHECK-RV64-NEXT: .LBB61_998: # %cond.load1937 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 486 +; CHECK-RV64-NEXT: li a3, 485 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 25 +; 
CHECK-RV64-NEXT: bltz a2, .LBB61_999 +; CHECK-RV64-NEXT: j .LBB61_501 +; CHECK-RV64-NEXT: .LBB61_999: # %cond.load1941 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 487 +; CHECK-RV64-NEXT: li a3, 486 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 24 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1000 +; CHECK-RV64-NEXT: j .LBB61_502 +; CHECK-RV64-NEXT: .LBB61_1000: # %cond.load1945 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 488 +; CHECK-RV64-NEXT: li a3, 487 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 23 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1001 +; CHECK-RV64-NEXT: j .LBB61_503 +; CHECK-RV64-NEXT: .LBB61_1001: # %cond.load1949 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 489 +; CHECK-RV64-NEXT: li a3, 488 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 22 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1002 +; CHECK-RV64-NEXT: j .LBB61_504 +; CHECK-RV64-NEXT: .LBB61_1002: # %cond.load1953 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 490 +; CHECK-RV64-NEXT: li a3, 489 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; 
CHECK-RV64-NEXT: slli a2, a1, 21 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1003 +; CHECK-RV64-NEXT: j .LBB61_505 +; CHECK-RV64-NEXT: .LBB61_1003: # %cond.load1957 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 491 +; CHECK-RV64-NEXT: li a3, 490 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 20 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1004 +; CHECK-RV64-NEXT: j .LBB61_506 +; CHECK-RV64-NEXT: .LBB61_1004: # %cond.load1961 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 492 +; CHECK-RV64-NEXT: li a3, 491 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 19 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1005 +; CHECK-RV64-NEXT: j .LBB61_507 +; CHECK-RV64-NEXT: .LBB61_1005: # %cond.load1965 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 493 +; CHECK-RV64-NEXT: li a3, 492 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 18 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1006 +; CHECK-RV64-NEXT: j .LBB61_508 +; CHECK-RV64-NEXT: .LBB61_1006: # %cond.load1969 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 494 +; CHECK-RV64-NEXT: li a3, 493 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; 
CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 17 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1007 +; CHECK-RV64-NEXT: j .LBB61_509 +; CHECK-RV64-NEXT: .LBB61_1007: # %cond.load1973 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 495 +; CHECK-RV64-NEXT: li a3, 494 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 16 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1008 +; CHECK-RV64-NEXT: j .LBB61_510 +; CHECK-RV64-NEXT: .LBB61_1008: # %cond.load1977 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 496 +; CHECK-RV64-NEXT: li a3, 495 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 15 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1009 +; CHECK-RV64-NEXT: j .LBB61_511 +; CHECK-RV64-NEXT: .LBB61_1009: # %cond.load1981 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 497 +; CHECK-RV64-NEXT: li a3, 496 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 14 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1010 +; CHECK-RV64-NEXT: j .LBB61_512 +; CHECK-RV64-NEXT: .LBB61_1010: # %cond.load1985 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 498 +; CHECK-RV64-NEXT: li a3, 497 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; 
CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 13 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1011 +; CHECK-RV64-NEXT: j .LBB61_513 +; CHECK-RV64-NEXT: .LBB61_1011: # %cond.load1989 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 499 +; CHECK-RV64-NEXT: li a3, 498 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 12 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1012 +; CHECK-RV64-NEXT: j .LBB61_514 +; CHECK-RV64-NEXT: .LBB61_1012: # %cond.load1993 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 500 +; CHECK-RV64-NEXT: li a3, 499 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 11 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1013 +; CHECK-RV64-NEXT: j .LBB61_515 +; CHECK-RV64-NEXT: .LBB61_1013: # %cond.load1997 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 501 +; CHECK-RV64-NEXT: li a3, 500 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 10 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1014 +; CHECK-RV64-NEXT: j .LBB61_516 +; CHECK-RV64-NEXT: .LBB61_1014: # %cond.load2001 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 502 +; CHECK-RV64-NEXT: li a3, 501 +; CHECK-RV64-NEXT: 
vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 9 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1015 +; CHECK-RV64-NEXT: j .LBB61_517 +; CHECK-RV64-NEXT: .LBB61_1015: # %cond.load2005 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 503 +; CHECK-RV64-NEXT: li a3, 502 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 8 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1016 +; CHECK-RV64-NEXT: j .LBB61_518 +; CHECK-RV64-NEXT: .LBB61_1016: # %cond.load2009 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 504 +; CHECK-RV64-NEXT: li a3, 503 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 7 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1017 +; CHECK-RV64-NEXT: j .LBB61_519 +; CHECK-RV64-NEXT: .LBB61_1017: # %cond.load2013 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 505 +; CHECK-RV64-NEXT: li a3, 504 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 6 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1018 +; CHECK-RV64-NEXT: j .LBB61_520 +; CHECK-RV64-NEXT: .LBB61_1018: # %cond.load2017 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 506 +; CHECK-RV64-NEXT: li 
a3, 505 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 5 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1019 +; CHECK-RV64-NEXT: j .LBB61_521 +; CHECK-RV64-NEXT: .LBB61_1019: # %cond.load2021 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 507 +; CHECK-RV64-NEXT: li a3, 506 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 4 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1020 +; CHECK-RV64-NEXT: j .LBB61_522 +; CHECK-RV64-NEXT: .LBB61_1020: # %cond.load2025 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 508 +; CHECK-RV64-NEXT: li a3, 507 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 3 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1021 +; CHECK-RV64-NEXT: j .LBB61_523 +; CHECK-RV64-NEXT: .LBB61_1021: # %cond.load2029 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 509 +; CHECK-RV64-NEXT: li a3, 508 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 2 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1022 +; CHECK-RV64-NEXT: j .LBB61_524 +; CHECK-RV64-NEXT: .LBB61_1022: # %cond.load2033 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li 
a2, 510 +; CHECK-RV64-NEXT: li a3, 509 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: slli a2, a1, 1 +; CHECK-RV64-NEXT: bltz a2, .LBB61_1023 +; CHECK-RV64-NEXT: j .LBB61_525 +; CHECK-RV64-NEXT: .LBB61_1023: # %cond.load2037 +; CHECK-RV64-NEXT: lbu a2, 0(a0) +; CHECK-RV64-NEXT: li a3, 512 +; CHECK-RV64-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a2 +; CHECK-RV64-NEXT: li a2, 511 +; CHECK-RV64-NEXT: li a3, 510 +; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m8, tu, ma +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a3 +; CHECK-RV64-NEXT: addi a0, a0, 1 +; CHECK-RV64-NEXT: bltz a1, .LBB61_1024 +; CHECK-RV64-NEXT: j .LBB61_526 +; CHECK-RV64-NEXT: .LBB61_1024: # %cond.load2041 +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: li a1, 512 +; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-RV64-NEXT: vmv.s.x v16, a0 +; CHECK-RV64-NEXT: li a0, 511 +; CHECK-RV64-NEXT: vslideup.vx v8, v16, a0 +; CHECK-RV64-NEXT: ret + %res = call <512 x i8> @llvm.masked.expandload.v512i8(ptr align 1 %base, <512 x i1> %mask, <512 x i8> %passthru) + ret <512 x i8> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/extload-truncstore.ll index dfc70299d015b..63fd13d98c7ad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extload-truncstore.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s define @sextload_nxv1i1_nxv1i8(ptr %x) { ; CHECK-LABEL: 
sextload_nxv1i1_nxv1i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index 63fa87852b646..8e9751502460e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s define @extract_nxv8i32_nxv4i32_0( %vec) { ; CHECK-LABEL: extract_nxv8i32_nxv4i32_0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index d8781495abd75..4e549a5aa7c3a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @ceil_nxv1f16( %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll index ee16b476dc84e..35936574e8fe2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index 1df452d8641c5..f6b47743d1154 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @floor_nxv1f16( %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index 00e21ce8992b0..d26b74c7c139e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: 
--check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll index 29489be4dcb56..327b168ffe6b5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32 -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64 define <1 x i1> @v1i1(i1 %x, i1 %y) { ; CHECK-LABEL: v1i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll index f124d550df16d..9ad1d7167c6a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh -verify-machineinstrs \ ; RUN: -target-abi=ilp32d < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \ +; RUN: llc 
-mtriple=riscv64 -mattr=+v,+d,+zvfh -verify-machineinstrs \ ; RUN: -target-abi=lp64d < %s | FileCheck %s --check-prefixes=CHECK,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+zve32f,+zvl128b,+d,+zfh,+zvfh \ +; RUN: llc -mtriple=riscv32 -mattr=+zve32f,+zvl128b,+d,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d < %s | FileCheck %s \ ; RUN: --check-prefixes=ELEN32,RV32ELEN32 -; RUN: llc -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+d,+zfh,+zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+zve32f,+zvl128b,+d,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d < %s | FileCheck %s \ ; RUN: --check-prefixes=ELEN32,RV64ELEN32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index befbfb88550ba..f7840be8f0c65 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.ceil.v2f16(<2 x half>, <2 x i1>, i32) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll index 36fbdd8e0664f..9d42f2b6adeed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>) define void @compressstore_v1f16(ptr %base, <1 x half> %v, <1 x i1> %mask) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 9f8de22b25c2d..1d7496397670f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll index 8b31166e313de..fa311154fa973 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-fp.ll @@ -1,1125 +1,199 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 declare <1 x half> @llvm.masked.expandload.v1f16(ptr, <1 x i1>, <1 x half>) define <1 x half> @expandload_v1f16(ptr %base, <1 x half> %src0, <1 x i1> %mask) { -; RV32-LABEL: expandload_v1f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB0_2 -; RV32-NEXT: # %bb.1: # %cond.load -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: .LBB0_2: # %else -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v1f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB0_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: .LBB0_2: # %else -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <1 x half> @llvm.masked.expandload.v1f16(ptr align 2 %base, <1 x i1> %mask, <1 x half> %src0) ret <1 x 
half>%res } declare <2 x half> @llvm.masked.expandload.v2f16(ptr, <2 x i1>, <2 x half>) define <2 x half> @expandload_v2f16(ptr %base, <2 x half> %src0, <2 x i1> %mask) { -; RV32-LABEL: expandload_v2f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB1_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB1_4 -; RV32-NEXT: .LBB1_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB1_3: # %cond.load -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB1_2 -; RV32-NEXT: .LBB1_4: # %cond.load1 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v2f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB1_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB1_4 -; RV64-NEXT: .LBB1_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB1_3: # %cond.load -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB1_2 -; RV64-NEXT: .LBB1_4: # %cond.load1 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: 
vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <2 x half> @llvm.masked.expandload.v2f16(ptr align 2 %base, <2 x i1> %mask, <2 x half> %src0) ret <2 x half>%res } declare <4 x half> @llvm.masked.expandload.v4f16(ptr, <4 x i1>, <4 x half>) define <4 x half> @expandload_v4f16(ptr %base, <4 x half> %src0, <4 x i1> %mask) { -; RV32-LABEL: expandload_v4f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB2_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB2_6 -; RV32-NEXT: .LBB2_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB2_7 -; RV32-NEXT: .LBB2_3: # %else6 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB2_8 -; RV32-NEXT: .LBB2_4: # %else10 -; RV32-NEXT: ret -; RV32-NEXT: .LBB2_5: # %cond.load -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB2_2 -; RV32-NEXT: .LBB2_6: # %cond.load1 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB2_3 -; RV32-NEXT: .LBB2_7: # %cond.load5 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 2 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB2_4 -; RV32-NEXT: .LBB2_8: # %cond.load9 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 3 -; RV32-NEXT: ret -; -; 
RV64-LABEL: expandload_v4f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB2_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB2_6 -; RV64-NEXT: .LBB2_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB2_7 -; RV64-NEXT: .LBB2_3: # %else6 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB2_8 -; RV64-NEXT: .LBB2_4: # %else10 -; RV64-NEXT: ret -; RV64-NEXT: .LBB2_5: # %cond.load -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB2_2 -; RV64-NEXT: .LBB2_6: # %cond.load1 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB2_3 -; RV64-NEXT: .LBB2_7: # %cond.load5 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 2 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB2_4 -; RV64-NEXT: .LBB2_8: # %cond.load9 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 3 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <4 x half> @llvm.masked.expandload.v4f16(ptr align 2 %base, <4 x i1> %mask, <4 x half> 
%src0) ret <4 x half>%res } declare <8 x half> @llvm.masked.expandload.v8f16(ptr, <8 x i1>, <8 x half>) define <8 x half> @expandload_v8f16(ptr %base, <8 x half> %src0, <8 x i1> %mask) { -; RV32-LABEL: expandload_v8f16: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB3_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB3_10 -; RV32-NEXT: .LBB3_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB3_11 -; RV32-NEXT: .LBB3_3: # %else6 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB3_12 -; RV32-NEXT: .LBB3_4: # %else10 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB3_13 -; RV32-NEXT: .LBB3_5: # %else14 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB3_14 -; RV32-NEXT: .LBB3_6: # %else18 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB3_15 -; RV32-NEXT: .LBB3_7: # %else22 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB3_16 -; RV32-NEXT: .LBB3_8: # %else26 -; RV32-NEXT: ret -; RV32-NEXT: .LBB3_9: # %cond.load -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB3_2 -; RV32-NEXT: .LBB3_10: # %cond.load1 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB3_3 -; RV32-NEXT: .LBB3_11: # %cond.load5 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 2 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB3_4 -; RV32-NEXT: .LBB3_12: # %cond.load9 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: 
vsetivli zero, 4, e16, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 3 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB3_5 -; RV32-NEXT: .LBB3_13: # %cond.load13 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 4 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB3_6 -; RV32-NEXT: .LBB3_14: # %cond.load17 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 5 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB3_7 -; RV32-NEXT: .LBB3_15: # %cond.load21 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 6 -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB3_8 -; RV32-NEXT: .LBB3_16: # %cond.load25 -; RV32-NEXT: flh fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 7 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v8f16: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB3_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB3_10 -; RV64-NEXT: .LBB3_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB3_11 -; RV64-NEXT: .LBB3_3: # %else6 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB3_12 -; RV64-NEXT: .LBB3_4: # %else10 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB3_13 -; RV64-NEXT: .LBB3_5: # %else14 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB3_14 -; RV64-NEXT: .LBB3_6: # %else18 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB3_15 -; RV64-NEXT: .LBB3_7: # %else22 
-; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB3_16 -; RV64-NEXT: .LBB3_8: # %else26 -; RV64-NEXT: ret -; RV64-NEXT: .LBB3_9: # %cond.load -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB3_2 -; RV64-NEXT: .LBB3_10: # %cond.load1 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB3_3 -; RV64-NEXT: .LBB3_11: # %cond.load5 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 2 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB3_4 -; RV64-NEXT: .LBB3_12: # %cond.load9 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 3 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB3_5 -; RV64-NEXT: .LBB3_13: # %cond.load13 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 4 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB3_6 -; RV64-NEXT: .LBB3_14: # %cond.load17 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 5 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB3_7 -; RV64-NEXT: .LBB3_15: # %cond.load21 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 6 -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, 
-128 -; RV64-NEXT: beqz a1, .LBB3_8 -; RV64-NEXT: .LBB3_16: # %cond.load25 -; RV64-NEXT: flh fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 7 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <8 x half> @llvm.masked.expandload.v8f16(ptr align 2 %base, <8 x i1> %mask, <8 x half> %src0) ret <8 x half>%res } declare <1 x float> @llvm.masked.expandload.v1f32(ptr, <1 x i1>, <1 x float>) define <1 x float> @expandload_v1f32(ptr %base, <1 x float> %src0, <1 x i1> %mask) { -; RV32-LABEL: expandload_v1f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB4_2 -; RV32-NEXT: # %bb.1: # %cond.load -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: .LBB4_2: # %else -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v1f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB4_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: .LBB4_2: # %else -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <1 x float> @llvm.masked.expandload.v1f32(ptr align 4 %base, <1 x i1> %mask, <1 x float> 
%src0) ret <1 x float>%res } declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>) define <2 x float> @expandload_v2f32(ptr %base, <2 x float> %src0, <2 x i1> %mask) { -; RV32-LABEL: expandload_v2f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB5_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB5_4 -; RV32-NEXT: .LBB5_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB5_3: # %cond.load -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB5_2 -; RV32-NEXT: .LBB5_4: # %cond.load1 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v2f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB5_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB5_4 -; RV64-NEXT: .LBB5_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB5_3: # %cond.load -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB5_2 -; RV64-NEXT: .LBB5_4: # %cond.load1 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, 
ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <2 x float> @llvm.masked.expandload.v2f32(ptr align 4 %base, <2 x i1> %mask, <2 x float> %src0) ret <2 x float>%res } declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>) define <4 x float> @expandload_v4f32(ptr %base, <4 x float> %src0, <4 x i1> %mask) { -; RV32-LABEL: expandload_v4f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB6_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB6_6 -; RV32-NEXT: .LBB6_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB6_7 -; RV32-NEXT: .LBB6_3: # %else6 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB6_8 -; RV32-NEXT: .LBB6_4: # %else10 -; RV32-NEXT: ret -; RV32-NEXT: .LBB6_5: # %cond.load -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB6_2 -; RV32-NEXT: .LBB6_6: # %cond.load1 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB6_3 -; RV32-NEXT: .LBB6_7: # %cond.load5 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 2 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB6_4 -; RV32-NEXT: .LBB6_8: # %cond.load9 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vslideup.vi v8, v9, 3 -; 
RV32-NEXT: ret -; -; RV64-LABEL: expandload_v4f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB6_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB6_6 -; RV64-NEXT: .LBB6_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB6_7 -; RV64-NEXT: .LBB6_3: # %else6 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB6_8 -; RV64-NEXT: .LBB6_4: # %else10 -; RV64-NEXT: ret -; RV64-NEXT: .LBB6_5: # %cond.load -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB6_2 -; RV64-NEXT: .LBB6_6: # %cond.load1 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB6_3 -; RV64-NEXT: .LBB6_7: # %cond.load5 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 2 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB6_4 -; RV64-NEXT: .LBB6_8: # %cond.load9 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vslideup.vi v8, v9, 3 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <4 x float> @llvm.masked.expandload.v4f32(ptr align 4 %base, <4 x i1> 
%mask, <4 x float> %src0) ret <4 x float>%res } declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) define <8 x float> @expandload_v8f32(ptr %base, <8 x float> %src0, <8 x i1> %mask) { -; RV32-LABEL: expandload_v8f32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB7_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB7_10 -; RV32-NEXT: .LBB7_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB7_11 -; RV32-NEXT: .LBB7_3: # %else6 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB7_12 -; RV32-NEXT: .LBB7_4: # %else10 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB7_13 -; RV32-NEXT: .LBB7_5: # %else14 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB7_14 -; RV32-NEXT: .LBB7_6: # %else18 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB7_15 -; RV32-NEXT: .LBB7_7: # %else22 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB7_16 -; RV32-NEXT: .LBB7_8: # %else26 -; RV32-NEXT: ret -; RV32-NEXT: .LBB7_9: # %cond.load -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB7_2 -; RV32-NEXT: .LBB7_10: # %cond.load1 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 1 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB7_3 -; RV32-NEXT: .LBB7_11: # %cond.load5 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 2 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB7_4 -; RV32-NEXT: .LBB7_12: # %cond.load9 -; RV32-NEXT: flw 
fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 3 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB7_5 -; RV32-NEXT: .LBB7_13: # %cond.load13 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 4 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB7_6 -; RV32-NEXT: .LBB7_14: # %cond.load17 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 5 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB7_7 -; RV32-NEXT: .LBB7_15: # %cond.load21 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 6 -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB7_8 -; RV32-NEXT: .LBB7_16: # %cond.load25 -; RV32-NEXT: flw fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 7 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v8f32: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB7_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB7_10 -; RV64-NEXT: .LBB7_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB7_11 -; RV64-NEXT: .LBB7_3: # %else6 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB7_12 -; RV64-NEXT: .LBB7_4: # %else10 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB7_13 -; RV64-NEXT: .LBB7_5: # %else14 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB7_14 -; RV64-NEXT: .LBB7_6: # %else18 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, 
.LBB7_15 -; RV64-NEXT: .LBB7_7: # %else22 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB7_16 -; RV64-NEXT: .LBB7_8: # %else26 -; RV64-NEXT: ret -; RV64-NEXT: .LBB7_9: # %cond.load -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB7_2 -; RV64-NEXT: .LBB7_10: # %cond.load1 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 1 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB7_3 -; RV64-NEXT: .LBB7_11: # %cond.load5 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 2 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB7_4 -; RV64-NEXT: .LBB7_12: # %cond.load9 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 3 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB7_5 -; RV64-NEXT: .LBB7_13: # %cond.load13 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 4 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB7_6 -; RV64-NEXT: .LBB7_14: # %cond.load17 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 5 -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB7_7 -; RV64-NEXT: .LBB7_15: # %cond.load21 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 6 -; 
RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB7_8 -; RV64-NEXT: .LBB7_16: # %cond.load25 -; RV64-NEXT: flw fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 7 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr align 4 %base, <8 x i1> %mask, <8 x float> %src0) ret <8 x float>%res } declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>) define <1 x double> @expandload_v1f64(ptr %base, <1 x double> %src0, <1 x i1> %mask) { -; RV32-LABEL: expandload_v1f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB8_2 -; RV32-NEXT: # %bb.1: # %cond.load -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: .LBB8_2: # %else -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v1f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB8_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: .LBB8_2: # %else -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <1 x double> 
@llvm.masked.expandload.v1f64(ptr align 8 %base, <1 x i1> %mask, <1 x double> %src0) ret <1 x double>%res } declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>) define <2 x double> @expandload_v2f64(ptr %base, <2 x double> %src0, <2 x i1> %mask) { -; RV32-LABEL: expandload_v2f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB9_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB9_4 -; RV32-NEXT: .LBB9_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB9_3: # %cond.load -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB9_2 -; RV32-NEXT: .LBB9_4: # %cond.load1 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vfmv.s.f v9, fa5 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v2f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB9_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB9_4 -; RV64-NEXT: .LBB9_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB9_3: # %cond.load -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB9_2 -; RV64-NEXT: .LBB9_4: # %cond.load1 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vfmv.s.f v9, fa5 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, 
ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <2 x double> @llvm.masked.expandload.v2f64(ptr align 8 %base, <2 x i1> %mask, <2 x double> %src0) ret <2 x double>%res } declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>) define <4 x double> @expandload_v4f64(ptr %base, <4 x double> %src0, <4 x i1> %mask) { -; RV32-LABEL: expandload_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB10_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB10_6 -; RV32-NEXT: .LBB10_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB10_7 -; RV32-NEXT: .LBB10_3: # %else6 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB10_8 -; RV32-NEXT: .LBB10_4: # %else10 -; RV32-NEXT: ret -; RV32-NEXT: .LBB10_5: # %cond.load -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB10_2 -; RV32-NEXT: .LBB10_6: # %cond.load1 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 1 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB10_3 -; RV32-NEXT: .LBB10_7: # %cond.load5 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 2 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB10_4 -; RV32-NEXT: .LBB10_8: # %cond.load9 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: 
vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vfmv.s.f v10, fa5 -; RV32-NEXT: vslideup.vi v8, v10, 3 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB10_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB10_6 -; RV64-NEXT: .LBB10_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB10_7 -; RV64-NEXT: .LBB10_3: # %else6 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB10_8 -; RV64-NEXT: .LBB10_4: # %else10 -; RV64-NEXT: ret -; RV64-NEXT: .LBB10_5: # %cond.load -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB10_2 -; RV64-NEXT: .LBB10_6: # %cond.load1 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 1 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB10_3 -; RV64-NEXT: .LBB10_7: # %cond.load5 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB10_4 -; RV64-NEXT: .LBB10_8: # %cond.load9 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vfmv.s.f v10, fa5 -; RV64-NEXT: vslideup.vi v8, v10, 3 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: 
vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret %res = call <4 x double> @llvm.masked.expandload.v4f64(ptr align 8 %base, <4 x i1> %mask, <4 x double> %src0) ret <4 x double>%res } declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>) define <8 x double> @expandload_v8f64(ptr %base, <8 x double> %src0, <8 x i1> %mask) { -; RV32-LABEL: expandload_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB11_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB11_10 -; RV32-NEXT: .LBB11_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB11_11 -; RV32-NEXT: .LBB11_3: # %else6 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB11_12 -; RV32-NEXT: .LBB11_4: # %else10 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB11_13 -; RV32-NEXT: .LBB11_5: # %else14 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB11_14 -; RV32-NEXT: .LBB11_6: # %else18 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB11_15 -; RV32-NEXT: .LBB11_7: # %else22 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB11_16 -; RV32-NEXT: .LBB11_8: # %else26 -; RV32-NEXT: ret -; RV32-NEXT: .LBB11_9: # %cond.load -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV32-NEXT: vfmv.s.f v8, fa5 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB11_2 -; RV32-NEXT: .LBB11_10: # %cond.load1 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 1 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB11_3 -; RV32-NEXT: .LBB11_11: # %cond.load5 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi 
v8, v12, 2 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB11_4 -; RV32-NEXT: .LBB11_12: # %cond.load9 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi v8, v12, 3 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB11_5 -; RV32-NEXT: .LBB11_13: # %cond.load13 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi v8, v12, 4 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB11_6 -; RV32-NEXT: .LBB11_14: # %cond.load17 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi v8, v12, 5 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB11_7 -; RV32-NEXT: .LBB11_15: # %cond.load21 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 7, e64, m4, tu, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi v8, v12, 6 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB11_8 -; RV32-NEXT: .LBB11_16: # %cond.load25 -; RV32-NEXT: fld fa5, 0(a0) -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vfmv.s.f v12, fa5 -; RV32-NEXT: vslideup.vi v8, v12, 7 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB11_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB11_10 -; RV64-NEXT: .LBB11_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB11_11 -; RV64-NEXT: .LBB11_3: # %else6 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB11_12 -; RV64-NEXT: .LBB11_4: # %else10 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB11_13 -; RV64-NEXT: 
.LBB11_5: # %else14 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB11_14 -; RV64-NEXT: .LBB11_6: # %else18 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB11_15 -; RV64-NEXT: .LBB11_7: # %else22 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB11_16 -; RV64-NEXT: .LBB11_8: # %else26 -; RV64-NEXT: ret -; RV64-NEXT: .LBB11_9: # %cond.load -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: vfmv.s.f v8, fa5 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB11_2 -; RV64-NEXT: .LBB11_10: # %cond.load1 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v12, 1 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB11_3 -; RV64-NEXT: .LBB11_11: # %cond.load5 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB11_4 -; RV64-NEXT: .LBB11_12: # %cond.load9 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 3 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB11_5 -; RV64-NEXT: .LBB11_13: # %cond.load13 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 4 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB11_6 -; RV64-NEXT: .LBB11_14: # %cond.load17 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 5 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB11_7 
-; RV64-NEXT: .LBB11_15: # %cond.load21 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 7, e64, m4, tu, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 6 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB11_8 -; RV64-NEXT: .LBB11_16: # %cond.load25 -; RV64-NEXT: fld fa5, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vfmv.s.f v12, fa5 -; RV64-NEXT: vslideup.vi v8, v12, 7 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret %res = call <8 x double> @llvm.masked.expandload.v8f64(ptr align 8 %base, <8 x i1> %mask, <8 x double> %src0) ret <8 x double>%res } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-RV32: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll index d34235127e838..269d3df00f05d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-expandload-int.ll @@ -1,18 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+m,+v %s -o - \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 +; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+m,+v %s -o - \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64 declare <1 x i8> @llvm.masked.expandload.v1i8(ptr, <1 x i1>, <1 x i8>) define <1 x i8> @expandload_v1i8(ptr %base, <1 x i8> %src0, <1 x i1> %mask) { ; CHECK-LABEL: expandload_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB0_2 -; CHECK-NEXT: # %bb.1: # %cond.load ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: .LBB0_2: # %else +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <1 x i8> @llvm.masked.expandload.v1i8(ptr %base, <1 x i1> %mask, <1 x i8> %src0) ret <1 x i8>%res @@ -22,28 +24,13 @@ declare <2 x i8> @llvm.masked.expandload.v2i8(ptr, <2 x i1>, <2 x i8>) define <2 x i8> @expandload_v2i8(ptr %base, <2 x i8> %src0, <2 x i1> %mask) { ; CHECK-LABEL: expandload_v2i8: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB1_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB1_4 -; CHECK-NEXT: .LBB1_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: # %cond.load -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB1_2 -; CHECK-NEXT: .LBB1_4: # %cond.load1 -; CHECK-NEXT: lbu a0, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %base, <2 x i1> %mask, <2 x i8> %src0) ret <2 x i8>%res @@ -53,50 +40,13 @@ declare <4 x i8> @llvm.masked.expandload.v4i8(ptr, <4 x i1>, <4 x i8>) define <4 x i8> @expandload_v4i8(ptr %base, <4 x i8> %src0, <4 x i1> %mask) { ; CHECK-LABEL: expandload_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB2_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB2_6 -; CHECK-NEXT: .LBB2_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB2_7 -; CHECK-NEXT: .LBB2_3: # %else6 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB2_8 -; CHECK-NEXT: .LBB2_4: # %else10 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_5: # %cond.load -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi 
a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB2_2 -; CHECK-NEXT: .LBB2_6: # %cond.load1 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB2_3 -; CHECK-NEXT: .LBB2_7: # %cond.load5 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB2_4 -; CHECK-NEXT: .LBB2_8: # %cond.load9 -; CHECK-NEXT: lbu a0, 0(a0) ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <4 x i8> @llvm.masked.expandload.v4i8(ptr %base, <4 x i1> %mask, <4 x i8> %src0) ret <4 x i8>%res @@ -106,94 +56,13 @@ declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>) define <8 x i8> @expandload_v8i8(ptr %base, <8 x i8> %src0, <8 x i1> %mask) { ; CHECK-LABEL: expandload_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB3_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB3_10 -; CHECK-NEXT: .LBB3_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB3_11 -; CHECK-NEXT: .LBB3_3: # %else6 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB3_12 -; CHECK-NEXT: .LBB3_4: # %else10 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB3_13 -; CHECK-NEXT: .LBB3_5: # %else14 -; 
CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB3_14 -; CHECK-NEXT: .LBB3_6: # %else18 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB3_15 -; CHECK-NEXT: .LBB3_7: # %else22 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB3_16 -; CHECK-NEXT: .LBB3_8: # %else26 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_9: # %cond.load -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB3_2 -; CHECK-NEXT: .LBB3_10: # %cond.load1 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB3_3 -; CHECK-NEXT: .LBB3_11: # %cond.load5 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB3_4 -; CHECK-NEXT: .LBB3_12: # %cond.load9 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB3_5 -; CHECK-NEXT: .LBB3_13: # %cond.load13 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 5, e8, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, .LBB3_6 -; CHECK-NEXT: .LBB3_14: # %cond.load17 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 5 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB3_7 -; CHECK-NEXT: 
.LBB3_15: # %cond.load21 -; CHECK-NEXT: lbu a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 7, e8, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 6 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB3_8 -; CHECK-NEXT: .LBB3_16: # %cond.load25 -; CHECK-NEXT: lbu a0, 0(a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 7 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <8 x i8> @llvm.masked.expandload.v8i8(ptr %base, <8 x i1> %mask, <8 x i8> %src0) ret <8 x i8>%res @@ -203,13 +72,13 @@ declare <1 x i16> @llvm.masked.expandload.v1i16(ptr, <1 x i1>, <1 x i16>) define <1 x i16> @expandload_v1i16(ptr %base, <1 x i16> %src0, <1 x i1> %mask) { ; CHECK-LABEL: expandload_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB4_2 -; CHECK-NEXT: # %bb.1: # %cond.load -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: .LBB4_2: # %else +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <1 x i16> @llvm.masked.expandload.v1i16(ptr align 2 %base, <1 x i1> %mask, <1 x i16> %src0) ret <1 x i16>%res @@ -219,28 +88,13 @@ declare <2 x i16> @llvm.masked.expandload.v2i16(ptr, <2 x i1>, <2 x i16>) define <2 x i16> @expandload_v2i16(ptr %base, <2 x i16> %src0, <2 x i1> %mask) { ; CHECK-LABEL: expandload_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, 
ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB5_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB5_4 -; CHECK-NEXT: .LBB5_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_3: # %cond.load -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB5_2 -; CHECK-NEXT: .LBB5_4: # %cond.load1 -; CHECK-NEXT: lh a0, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <2 x i16> @llvm.masked.expandload.v2i16(ptr align 2 %base, <2 x i1> %mask, <2 x i16> %src0) ret <2 x i16>%res @@ -250,50 +104,13 @@ declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>) define <4 x i16> @expandload_v4i16(ptr %base, <4 x i16> %src0, <4 x i1> %mask) { ; CHECK-LABEL: expandload_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB6_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB6_6 -; CHECK-NEXT: .LBB6_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB6_7 -; CHECK-NEXT: .LBB6_3: # %else6 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB6_8 -; CHECK-NEXT: .LBB6_4: # %else10 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB6_5: # %cond.load -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; 
CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB6_2 -; CHECK-NEXT: .LBB6_6: # %cond.load1 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB6_3 -; CHECK-NEXT: .LBB6_7: # %cond.load5 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB6_4 -; CHECK-NEXT: .LBB6_8: # %cond.load9 -; CHECK-NEXT: lh a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <4 x i16> @llvm.masked.expandload.v4i16(ptr align 2 %base, <4 x i1> %mask, <4 x i16> %src0) ret <4 x i16>%res @@ -303,94 +120,13 @@ declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>) define <8 x i16> @expandload_v8i16(ptr %base, <8 x i16> %src0, <8 x i1> %mask) { ; CHECK-LABEL: expandload_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB7_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB7_10 -; CHECK-NEXT: .LBB7_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB7_11 -; CHECK-NEXT: .LBB7_3: # %else6 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB7_12 -; CHECK-NEXT: .LBB7_4: # %else10 -; 
CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB7_13 -; CHECK-NEXT: .LBB7_5: # %else14 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB7_14 -; CHECK-NEXT: .LBB7_6: # %else18 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB7_15 -; CHECK-NEXT: .LBB7_7: # %else22 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB7_16 -; CHECK-NEXT: .LBB7_8: # %else26 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB7_9: # %cond.load -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB7_2 -; CHECK-NEXT: .LBB7_10: # %cond.load1 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB7_3 -; CHECK-NEXT: .LBB7_11: # %cond.load5 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB7_4 -; CHECK-NEXT: .LBB7_12: # %cond.load9 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB7_5 -; CHECK-NEXT: .LBB7_13: # %cond.load13 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, .LBB7_6 -; CHECK-NEXT: .LBB7_14: # %cond.load17 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 5 -; 
CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB7_7 -; CHECK-NEXT: .LBB7_15: # %cond.load21 -; CHECK-NEXT: lh a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 6 -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB7_8 -; CHECK-NEXT: .LBB7_16: # %cond.load25 -; CHECK-NEXT: lh a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 7 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <8 x i16> @llvm.masked.expandload.v8i16(ptr align 2 %base, <8 x i1> %mask, <8 x i16> %src0) ret <8 x i16>%res @@ -400,13 +136,13 @@ declare <1 x i32> @llvm.masked.expandload.v1i32(ptr, <1 x i1>, <1 x i32>) define <1 x i32> @expandload_v1i32(ptr %base, <1 x i32> %src0, <1 x i1> %mask) { ; CHECK-LABEL: expandload_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB8_2 -; CHECK-NEXT: # %bb.1: # %cond.load -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: .LBB8_2: # %else +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <1 x i32> @llvm.masked.expandload.v1i32(ptr align 4 %base, <1 x i1> %mask, <1 x i32> %src0) ret <1 x i32>%res @@ -416,28 +152,13 @@ declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>) define 
<2 x i32> @expandload_v2i32(ptr %base, <2 x i32> %src0, <2 x i1> %mask) { ; CHECK-LABEL: expandload_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB9_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB9_4 -; CHECK-NEXT: .LBB9_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB9_3: # %cond.load -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB9_2 -; CHECK-NEXT: .LBB9_4: # %cond.load1 -; CHECK-NEXT: lw a0, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <2 x i32> @llvm.masked.expandload.v2i32(ptr align 4 %base, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res @@ -447,50 +168,13 @@ declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>) define <4 x i32> @expandload_v4i32(ptr %base, <4 x i32> %src0, <4 x i1> %mask) { ; CHECK-LABEL: expandload_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB10_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB10_6 -; CHECK-NEXT: .LBB10_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB10_7 -; CHECK-NEXT: .LBB10_3: # %else6 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB10_8 -; CHECK-NEXT: .LBB10_4: # 
%else10 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB10_5: # %cond.load -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB10_2 -; CHECK-NEXT: .LBB10_6: # %cond.load1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB10_3 -; CHECK-NEXT: .LBB10_7: # %cond.load5 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB10_4 -; CHECK-NEXT: .LBB10_8: # %cond.load9 -; CHECK-NEXT: lw a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = call <4 x i32> @llvm.masked.expandload.v4i32(ptr align 4 %base, <4 x i1> %mask, <4 x i32> %src0) ret <4 x i32>%res @@ -500,94 +184,13 @@ declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>) define <8 x i32> @expandload_v8i32(ptr %base, <8 x i32> %src0, <8 x i1> %mask) { ; CHECK-LABEL: expandload_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB11_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB11_10 -; CHECK-NEXT: .LBB11_2: # %else2 -; CHECK-NEXT: andi 
a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB11_11 -; CHECK-NEXT: .LBB11_3: # %else6 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB11_12 -; CHECK-NEXT: .LBB11_4: # %else10 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB11_13 -; CHECK-NEXT: .LBB11_5: # %else14 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB11_14 -; CHECK-NEXT: .LBB11_6: # %else18 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB11_15 -; CHECK-NEXT: .LBB11_7: # %else22 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB11_16 -; CHECK-NEXT: .LBB11_8: # %else26 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_9: # %cond.load -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a2 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB11_2 -; CHECK-NEXT: .LBB11_10: # %cond.load1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 1 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB11_3 -; CHECK-NEXT: .LBB11_11: # %cond.load5 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 2 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB11_4 -; CHECK-NEXT: .LBB11_12: # %cond.load9 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 3 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB11_5 -; CHECK-NEXT: .LBB11_13: # %cond.load13 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, 
.LBB11_6 -; CHECK-NEXT: .LBB11_14: # %cond.load17 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB11_7 -; CHECK-NEXT: .LBB11_15: # %cond.load21 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vmv.s.x v10, a2 -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB11_8 -; CHECK-NEXT: .LBB11_16: # %cond.load25 -; CHECK-NEXT: lw a0, 0(a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret %res = call <8 x i32> @llvm.masked.expandload.v8i32(ptr align 4 %base, <8 x i1> %mask, <8 x i32> %src0) ret <8 x i32>%res @@ -595,418 +198,67 @@ define <8 x i32> @expandload_v8i32(ptr %base, <8 x i32> %src0, <8 x i1> %mask) { declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>) define <1 x i64> @expandload_v1i64(ptr %base, <1 x i64> %src0, <1 x i1> %mask) { -; RV32-LABEL: expandload_v1i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB12_2 -; RV32-NEXT: # %bb.1: # %cond.load -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .LBB12_2: # %else -; 
RV32-NEXT: ret -; -; RV64-LABEL: expandload_v1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB12_2 -; RV64-NEXT: # %bb.1: # %cond.load -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: .LBB12_2: # %else -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <1 x i64> @llvm.masked.expandload.v1i64(ptr align 8 %base, <1 x i1> %mask, <1 x i64> %src0) ret <1 x i64>%res } declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>) define <2 x i64> @expandload_v2i64(ptr %base, <2 x i64> %src0, <2 x i1> %mask) { -; RV32-LABEL: expandload_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB13_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB13_4 -; RV32-NEXT: .LBB13_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB13_3: # %cond.load -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB13_2 -; RV32-NEXT: .LBB13_4: # %cond.load1 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v9, v8, a1 -; RV32-NEXT: vslide1down.vx v9, v9, a0 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vslideup.vi v8, v9, 1 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v2i64: -; RV64: # %bb.0: -; 
RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB13_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB13_4 -; RV64-NEXT: .LBB13_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB13_3: # %cond.load -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: vmv.s.x v8, a2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB13_2 -; RV64-NEXT: .LBB13_4: # %cond.load1 -; RV64-NEXT: ld a0, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v9, a0 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vslideup.vi v8, v9, 1 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: viota.m v10, v0 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr align 8 %base, <2 x i1> %mask, <2 x i64> %src0) ret <2 x i64>%res } declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>) define <4 x i64> @expandload_v4i64(ptr %base, <4 x i64> %src0, <4 x i1> %mask) { -; RV32-LABEL: expandload_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB14_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB14_6 -; RV32-NEXT: .LBB14_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB14_7 -; RV32-NEXT: .LBB14_3: # %else6 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB14_8 -; RV32-NEXT: .LBB14_4: # %else10 -; RV32-NEXT: ret -; RV32-NEXT: .LBB14_5: # %cond.load -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 
4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB14_2 -; RV32-NEXT: .LBB14_6: # %cond.load1 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v10, v8, a2 -; RV32-NEXT: vslide1down.vx v10, v10, a3 -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 1 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB14_3 -; RV32-NEXT: .LBB14_7: # %cond.load5 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v8, a2 -; RV32-NEXT: vslide1down.vx v10, v10, a3 -; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 2 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB14_4 -; RV32-NEXT: .LBB14_8: # %cond.load9 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v10, v8, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a0 -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vslideup.vi v8, v10, 3 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB14_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB14_6 -; RV64-NEXT: .LBB14_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB14_7 -; RV64-NEXT: .LBB14_3: # %else6 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB14_8 -; RV64-NEXT: .LBB14_4: # %else10 -; RV64-NEXT: ret -; RV64-NEXT: .LBB14_5: # %cond.load -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: 
vmv.s.x v8, a2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB14_2 -; RV64-NEXT: .LBB14_6: # %cond.load1 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v10, a2 -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 1 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB14_3 -; RV64-NEXT: .LBB14_7: # %cond.load5 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV64-NEXT: vmv.s.x v10, a2 -; RV64-NEXT: vslideup.vi v8, v10, 2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB14_4 -; RV64-NEXT: .LBB14_8: # %cond.load9 -; RV64-NEXT: ld a0, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vslideup.vi v8, v10, 3 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: viota.m v12, v0 +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret %res = call <4 x i64> @llvm.masked.expandload.v4i64(ptr align 8 %base, <4 x i1> %mask, <4 x i64> %src0) ret <4 x i64>%res } declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>) define <8 x i64> @expandload_v8i64(ptr %base, <8 x i64> %src0, <8 x i1> %mask) { -; RV32-LABEL: expandload_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB15_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB15_10 -; RV32-NEXT: .LBB15_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB15_11 -; RV32-NEXT: .LBB15_3: # %else6 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, 
.LBB15_12 -; RV32-NEXT: .LBB15_4: # %else10 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB15_13 -; RV32-NEXT: .LBB15_5: # %else14 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB15_14 -; RV32-NEXT: .LBB15_6: # %else18 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB15_15 -; RV32-NEXT: .LBB15_7: # %else22 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB15_16 -; RV32-NEXT: .LBB15_8: # %else26 -; RV32-NEXT: ret -; RV32-NEXT: .LBB15_9: # %cond.load -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; RV32-NEXT: vslide1down.vx v8, v8, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB15_2 -; RV32-NEXT: .LBB15_10: # %cond.load1 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 1 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB15_3 -; RV32-NEXT: .LBB15_11: # %cond.load5 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 2 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB15_4 -; RV32-NEXT: .LBB15_12: # %cond.load9 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 3 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB15_5 -; RV32-NEXT: .LBB15_13: # %cond.load13 
-; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 4 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB15_6 -; RV32-NEXT: .LBB15_14: # %cond.load17 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 5 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB15_7 -; RV32-NEXT: .LBB15_15: # %cond.load21 -; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a3 -; RV32-NEXT: vsetivli zero, 7, e64, m4, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 6 -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB15_8 -; RV32-NEXT: .LBB15_16: # %cond.load25 -; RV32-NEXT: lw a1, 0(a0) -; RV32-NEXT: lw a0, 4(a0) -; RV32-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a1 -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vslideup.vi v8, v12, 7 -; RV32-NEXT: ret -; -; RV64-LABEL: expandload_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB15_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB15_10 -; RV64-NEXT: .LBB15_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB15_11 -; RV64-NEXT: .LBB15_3: # %else6 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB15_12 -; RV64-NEXT: .LBB15_4: # 
%else10 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB15_13 -; RV64-NEXT: .LBB15_5: # %else14 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB15_14 -; RV64-NEXT: .LBB15_6: # %else18 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB15_15 -; RV64-NEXT: .LBB15_7: # %else22 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB15_16 -; RV64-NEXT: .LBB15_8: # %else26 -; RV64-NEXT: ret -; RV64-NEXT: .LBB15_9: # %cond.load -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, tu, ma -; RV64-NEXT: vmv.s.x v8, a2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB15_2 -; RV64-NEXT: .LBB15_10: # %cond.load1 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma -; RV64-NEXT: vslideup.vi v8, v12, 1 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB15_3 -; RV64-NEXT: .LBB15_11: # %cond.load5 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vslideup.vi v8, v12, 2 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB15_4 -; RV64-NEXT: .LBB15_12: # %cond.load9 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vslideup.vi v8, v12, 3 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB15_5 -; RV64-NEXT: .LBB15_13: # %cond.load13 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vslideup.vi v8, v12, 4 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB15_6 -; RV64-NEXT: .LBB15_14: # %cond.load17 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vslideup.vi v8, v12, 5 -; RV64-NEXT: addi a0, a0, 8 -; 
RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB15_7 -; RV64-NEXT: .LBB15_15: # %cond.load21 -; RV64-NEXT: ld a2, 0(a0) -; RV64-NEXT: vsetivli zero, 7, e64, m4, tu, ma -; RV64-NEXT: vmv.s.x v12, a2 -; RV64-NEXT: vslideup.vi v8, v12, 6 -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB15_8 -; RV64-NEXT: .LBB15_16: # %cond.load25 -; RV64-NEXT: ld a0, 0(a0) -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vmv.s.x v12, a0 -; RV64-NEXT: vslideup.vi v8, v12, 7 -; RV64-NEXT: ret +; CHECK-LABEL: expandload_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64.v v12, (a0) +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: viota.m v16, v0 +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret %res = call <8 x i64> @llvm.masked.expandload.v8i64(ptr align 8 %base, <8 x i1> %mask, <8 x i64> %src0) ret <8 x i64>%res } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-RV32: {{.*}} +; CHECK-RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index d309da6df7dc7..f2052ccc46279 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32NOM -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32M -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64NOM -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64M +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32NOM +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32M +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64NOM +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d,+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64M define i8 @extractelt_v16i8(ptr %x) nounwind { ; CHECK-LABEL: extractelt_v16i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index 404fb72b8abe9..84895715e814f 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index 2319aab370d2d..3c99870dba950 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index c1b4c5fda6c64..87061581af739 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v 
-target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.floor.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index 51eb63f5f9221..731f57a3a6d29 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.maximum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll index 02c2fafc89785..ae592119cf881 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 03e0ac42c442c..dae1399d66900 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.minimum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll index b15d697f0754e..8e042fc0785e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d 
\ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 719dd52494284..0b9fabb832e29 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll index 5f5015c9ad169..5b35c0083ca0c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV32-FP -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v 
-target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV64-FP diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 96b9b2bac2993..924732e554f0e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RV64V -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RVA22U64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RV64V +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RVA22U64 ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-NO-ZFHMIN ; RUN: 
llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-NO-ZFHMIN ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-ZFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll index d0dc70fd81151..ff52f5d2039e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s define void @fpext_v2f16_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: fpext_v2f16_v2f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index f3b124aa34dcb..a138b02b6139d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m,+zvl512b 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 ; Test optimizing interleaves to widening arithmetic. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll index 250b3e90cbbb6..0d0ef9c87946c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-splat.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFH +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFH ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64-ZVFHMIN-NOZFHMIN ; 
RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,RV64-ZVFHMIN,RV64_ZVFHMIN-ZFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll index d92dc3edecb0b..f0e6df6298471 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define void @fp2si_v2f32_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f32_v2i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index 9d92018db2e88..da0bc5983a125 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void @fp2si_v2f32_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f32_v2i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll index 31d7844251a77..a1466d46f1ba7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fpext-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc 
-mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s declare <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll index 602662b184290..bc86be6f62fd1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <4 x i1> @llvm.vp.fptosi.v4i1.v4f16(<4 x half>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll index a4050b716e787..f6c992280c6e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptosi-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -mtriple=riscv32 
-mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <4 x i7> @llvm.vp.fptosi.v4i7.v4f16(<4 x half>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll index c5bfd41ec9510..c41f14076db31 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 
-mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <4 x i1> @llvm.vp.fptoui.v4i1.v4f16(<4 x half>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll index b652cdd88c7c2..af225f4d95aa2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptoui-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <4 x i7> @llvm.vp.fptoui.v4i7.v4f16(<4 x half>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll index cd123cdf33a8f..e64c7c87132ee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | 
FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index e855d9504ff40..131fa53b35999 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This file tests the code generation for `llvm.experimental.constrained.round.*` on scalable vector type. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll index 3f1bc03435840..b21be367f8ef5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.round.*` on fixed vector type. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 9976cd2a8ab29..37f2e59ad7516 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This file tests the code generation for `llvm.experimental.constrained.roundeven.*` on scalable vector type. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll index 9607aa09d89d6..13d62bb24441c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN ; This file tests the code generation for `llvm.roundeven.*` on fixed vector type. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll index eac26451d5a8c..b911722368ce3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 9cdc9b81c9530..29f8730021ce4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d 
-mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void @si2fp_v2i32_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: si2fp_v2i32_v2f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 7853e91ca53a3..87f9bfbd1aaff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <4 x i32> @insertelt_v4i32_0(<4 x 
i32> %a, i32 %y) { ; CHECK-LABEL: insertelt_v4i32_0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 8de9cc25ae09a..7de9b59c6853f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll index 3bf8d10654acd..2fab2b76ee27a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define void @v2i8(ptr %p, ptr %q) { ; CHECK-LABEL: v2i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll index 22cde3c36ef61..8f1e026d09c0a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll @@ -1,6 +1,6 
@@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define <5 x i8> @load_v5i8(ptr %p) { ; CHECK-LABEL: load_v5i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 9c6ec6aef6034..5802f45d311b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ ; RUN: 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll index 79b05334cb1f9..f1d300b300a64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; CHECK-LABEL: masked_load_v1f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 323f08acac28e..a445c8fe08172 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=CHECK,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZVE32F declare void @llvm.masked.scatter.v1i8.v1p0(<1 x i8>, <1 x ptr>, i32, <1 x i1>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll index a1e81ea41c249..80110b3eef4dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define void @masked_store_v1f16(<1 x half>* %val_ptr, <1 x half>* %a, <1 x half>* %m_ptr) nounwind { ; CHECK-LABEL: masked_store_v1f16: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 93b4f7d2a9c9f..3fab9ce636786 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll 
@@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll index 6d5be7f14bf70..6684e6d223eac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 1f856d04ca89f..79dc2db8b1691 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc 
-mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.rint.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index 0f587232680df..2228147176de5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.round.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 0fb7e6a7de569..336ffc8603faf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc 
-mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.roundeven.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 927f96b644227..9f7124f1e4d9f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll new file mode 100644 index 0000000000000..4621f339ca882 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +define <8 x float> @fpext_v8bf16(<8 x bfloat> %x) { +; CHECK-LABEL: fpext_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a0, fa0 +; CHECK-NEXT: fmv.x.w a1, fa1 +; CHECK-NEXT: fmv.x.w a2, fa2 +; CHECK-NEXT: fmv.x.w a3, fa3 +; CHECK-NEXT: fmv.x.w a4, fa4 +; CHECK-NEXT: fmv.x.w a5, fa5 +; CHECK-NEXT: fmv.x.w a6, fa6 +; CHECK-NEXT: fmv.x.w a7, fa7 +; CHECK-NEXT: slli a7, a7, 16 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.s.x v8, a7 +; CHECK-NEXT: slli a6, a6, 16 +; CHECK-NEXT: vmv.s.x v9, a6 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: slli a5, a5, 16 +; CHECK-NEXT: vmv.s.x v8, a5 +; CHECK-NEXT: slli a4, a4, 16 +; CHECK-NEXT: vmv.s.x v10, a4 +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: slli a3, a3, 16 +; CHECK-NEXT: vmv.s.x v8, a3 +; CHECK-NEXT: slli a2, a2, 16 +; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: slli a1, a1, 16 +; CHECK-NEXT: vmv.s.x v11, a1 +; CHECK-NEXT: slli a0, a0, 16 +; CHECK-NEXT: 
vmv.s.x v8, a0 +; CHECK-NEXT: vslideup.vi v8, v11, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: ret + %y = fpext <8 x bfloat> %x to <8 x float> + ret <8 x float> %y +} + +define <8 x float> @fpext_v8f16(<8 x bfloat> %x) { +; CHECK-LABEL: fpext_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a0, fa0 +; CHECK-NEXT: fmv.x.w a1, fa1 +; CHECK-NEXT: fmv.x.w a2, fa2 +; CHECK-NEXT: fmv.x.w a3, fa3 +; CHECK-NEXT: fmv.x.w a4, fa4 +; CHECK-NEXT: fmv.x.w a5, fa5 +; CHECK-NEXT: fmv.x.w a6, fa6 +; CHECK-NEXT: fmv.x.w a7, fa7 +; CHECK-NEXT: slli a7, a7, 16 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vmv.s.x v8, a7 +; CHECK-NEXT: slli a6, a6, 16 +; CHECK-NEXT: vmv.s.x v9, a6 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: slli a5, a5, 16 +; CHECK-NEXT: vmv.s.x v8, a5 +; CHECK-NEXT: slli a4, a4, 16 +; CHECK-NEXT: vmv.s.x v10, a4 +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: slli a3, a3, 16 +; CHECK-NEXT: vmv.s.x v8, a3 +; CHECK-NEXT: slli a2, a2, 16 +; CHECK-NEXT: vmv.s.x v9, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: slli a1, a1, 16 +; CHECK-NEXT: vmv.s.x v11, a1 +; CHECK-NEXT: slli a0, a0, 16 +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vslideup.vi v8, v11, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: ret + %y = fpext <8 x bfloat> %x to <8 x float> + ret <8 x float> %y +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index cbf9829826fb6..f5c45ba9ea581 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll 
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVBB,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVBB,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVBB,RV32-ZVBB -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVBB,RV64-ZVBB +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVBB,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVBB,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVBB,RV32-ZVBB +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVBB,RV64-ZVBB define <2 x i1> @reverse_v2i1(<2 x i1> %a) { ; NO-ZVBB-LABEL: reverse_v2i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index dffe0e0646ecc..814e35f201dca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll index ff5f6960ed4e4..f531ff3a835e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll index e0b2dd1af9183..b3390b6eeeccd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 
-mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index 4ef65032469e4..a3e50685889d1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zvl256b \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvl256b \ ; RUN: -lower-interleaved-accesses=false -verify-machineinstrs \ ; RUN: | FileCheck %s --check-prefixes=CHECK,V -; RUN: llc < %s -mtriple=riscv64 -mattr=+f,+zve32f,+zfh,+zvfh,+zvl256b \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+f,+zve32f,+zvfh,+zvl256b \ ; RUN: -lower-interleaved-accesses=false -verify-machineinstrs \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZVE32F diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp-mask.ll index 67c045cc2b189..a1390a8b1c0de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 
-mattr=+m,+v,+zvfh < %s | FileCheck %s declare <4 x half> @llvm.vp.sitofp.v4f16.v4i1(<4 x i1>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll index bf0eab77d0ac8..a2d41de5d1853 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sitofp-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s | FileCheck %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN declare <4 x half> @llvm.vp.sitofp.v4f16.v4i7(<4 x i7>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll index 169d99abb13c2..5232d0d69fad0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 
-mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define void @store_v5i8(ptr %p, <5 x i8> %v) { ; CHECK-LABEL: store_v5i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index b49e323478e8c..ed72883e9d052 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN ; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp-mask.ll index adfb26cd31060..e625c46a57145 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp-mask.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s | FileCheck %s declare <4 x half> @llvm.vp.uitofp.v4f16.v4i1(<4 x i1>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll index e28d55f46abcb..a0d5d2ccc848d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-uitofp-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <4 x half> @llvm.vp.uitofp.v4f16.v4i7(<4 x i7>, <4 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll index 77a095303675f..f1dc476e5a430 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.copysign.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll index 90a856605c70d..08f486b601328 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfabs-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.fabs.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-constrained-sdnode.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-constrained-sdnode.ll index 441cbebf56757..599f505808ab4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfadd-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half>, <2 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass-vp.ll index 09b9e7ce4c53d..690c8af7fc8e7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x i1> @isnan_v2f16(<2 x half> %x, <2 x i1> %m, i32 zeroext %evl) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass.ll index a1c36db5cfc31..85e8638301ded 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfclass.ll @@ -1,7 +1,7 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x i1> @isnan_v2f16(<2 x half> %x) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll index 1a75c50f2b646..4e9862b05f408 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <1 x i1> @llvm.experimental.constrained.fcmp.v1f16(<1 x half>, <1 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll index 83037baf3fabc..97641ff6d92d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v 
-target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <1 x i1> @llvm.experimental.constrained.fcmps.v1f16(<1 x half>, <1 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-constrained-sdnode.ll index fb9612d095040..1bc880d93af1a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfdiv-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.experimental.constrained.fdiv.v2f16(<2 x half>, <2 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll index 2d6e1fd02dee5..bc13e1d217a9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 
-mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmadd-constrained-sdnode.ll index 52d96fc63fadf..b8f3f0fef0419 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmadd-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfmacc and vfmadd by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll index c83a298cb501e..1f3c7a915d848 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc 
-mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.maxnum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll index 60dbededb90a5..a3dbd33451374 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.minnum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll index fc6578225aa64..99fc035235671 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll index 652198b0d4469..268494bf337e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfmsac and vfmsub by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-constrained-sdnode.ll index d5e96c88f9388..c8148a5e8d49c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmul-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <1 x half> @llvm.experimental.constrained.fmul.v1f16(<1 x half>, <1 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll index 6c695b43d2718..3912a37e6beb2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll index 019923ffdfded..968fd9f9bab80 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfneg-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.fneg.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll index 6d65ab4083f7c..4ab94444b1b89 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll index b7f5dd49b3508..afc89717596b2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll @@ 
-1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfnmacc and vfnmadd by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll index df705270664bc..4d9b002cc785c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fma.v2f16(<2 x half>, <2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll index ace96c1a571d1..d9863bb36c739 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v 
-target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfnmsac and vfnmsub by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll index 5321f731441e3..59fd8bbd17953 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfpext-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll index 4ac72bf0a3b00..1f74691437ad2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptrunc-constrained-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 
-mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrdiv-vp.ll index bd354b7dae803..fb813d4381a7d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrdiv-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fdiv.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrsub-vp.ll index 0903ef8c8ec3f..63c2d1f2e7db3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfrsub-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v 
-target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.vp.fsub.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-constrained-sdnode.ll index 9f29d14050de7..62d03e1ab588a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.experimental.constrained.sqrt.v2f16(<2 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll index 988b200ae5365..c1e63cbf0b138 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsqrt-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v 
-target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare <2 x half> @llvm.vp.sqrt.v2f16(<2 x half>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-constrained-sdnode.ll index f9d40d7a117b5..e6001352a237b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfsub-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half>, <2 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index cb50ca4a72120..1144f776e7fbf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN ; Check that the default value enables the web folding and ; that it is bigger than 3. 
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) { ; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index afea1dc6d3c2a..4bd521725f488 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+f,+d -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwadd_v2f16(ptr %x, ptr %y) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll index 5140d89b78307..a48be7687106e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <1 x 
float> @llvm.fma.v1f32(<1 x float>, <1 x float>, <1 x float>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 319994d265565..84c1262177891 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+f,+d -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwmul_v2f16(ptr %x, ptr %y) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll index 2c706cad9742f..b8b26a4d5adf0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+f,+d -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+f,+d -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define <2 x float> @vfwsub_v2f16(ptr %x, ptr %y) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll index 3dec7daf66ac9..5eb54fc7e299a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vitofp-constrained-sdnode.ll @@ -412,30 +412,20 @@ declare <1 x half> @llvm.experimental.constrained.sitofp.v1f16.v1i7(<1 x i7>, me define <1 x half> @vsitofp_v1i7_v1f16(<1 x i7> %va) strictfp { ; RV32-LABEL: vsitofp_v1i7_v1f16: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: slli a0, a0, 25 ; RV32-NEXT: srai a0, a0, 25 ; RV32-NEXT: fcvt.h.w fa5, a0 -; RV32-NEXT: fsh fa5, 14(sp) -; RV32-NEXT: addi a0, sp, 14 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vfmv.s.f v8, fa5 ; RV32-NEXT: ret ; ; RV64-LABEL: vsitofp_v1i7_v1f16: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: slli a0, a0, 57 ; RV64-NEXT: srai a0, a0, 57 ; RV64-NEXT: fcvt.h.w fa5, a0 -; RV64-NEXT: fsh fa5, 14(sp) -; RV64-NEXT: addi a0, sp, 14 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vfmv.s.f v8, fa5 ; RV64-NEXT: ret %evec = call <1 x half> @llvm.experimental.constrained.sitofp.v1f16.v1i7(<1 x i7> %va, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <1 x half> %evec @@ -445,15 +435,10 @@ declare <1 x half> @llvm.experimental.constrained.uitofp.v1f16.v1i7(<1 x i7>, me define <1 x half> @vuitofp_v1i7_v1f16(<1 x i7> %va) strictfp { ; CHECK-LABEL: vuitofp_v1i7_v1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: andi a0, a0, 127 ; CHECK-NEXT: fcvt.h.wu fa5, a0 -; CHECK-NEXT: fsh fa5, 14(sp) -; CHECK-NEXT: addi a0, sp, 14 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa5 ; CHECK-NEXT: ret %evec = call <1 x 
half> @llvm.experimental.constrained.uitofp.v1f16.v1i7(<1 x i7> %va, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <1 x half> %evec diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll index a31405f75e8a4..2e3e36e45d571 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vp-splat.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define <1 x i8> @vp_splat_v1i8(i8 %val, <1 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_splat_v1i8: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 028fb9a626f02..f66974e511406 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare <2 x i8> @llvm.vp.gather.v2i8.v2p0(<2 x ptr>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll index f204d812c14f6..351d7d4cd9b09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <2 x i8> @llvm.vp.load.v2i8.p0(ptr, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index c055039876191..0a61bc42326f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare void @llvm.vp.scatter.v2i8.v2p0(<2 x i8>, <2 x ptr>, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll index f396790f4f178..d34292abdce0d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v \ ; RUN: 
-verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.vp.store.v2i8.p0(<2 x i8>, ptr, <2 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 0a2ed3eb1ffbf..99aafdbcde127 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare <1 x i1> @llvm.vp.select.v1i1(<1 x i1>, <1 x i1>, <1 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index 2194651a95e54..a3bba2dd8265c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 03d1fb6c8d297..40c855b5d0451 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | 
FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index d8c3ab27cfad1..ec5b0136c3830 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s 
| FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare @llvm.maximum.nxv1bf16(, ) diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index dd01e1c1ee66d..7b70a0daf11c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 \ -; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN ; RUN: llc -mtriple=riscv64 \ -; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: 
--check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll index 2371840002f40..4061cbca0c48d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN declare @llvm.minimum.nxv1bf16(, ) diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index 85cac8d187059..2526b87651779 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 \ -; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN ; RUN: llc -mtriple=riscv64 \ -; RUN: -mattr=+d,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ +; RUN: -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v,+m \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index 372937bb5ca5d..f22cd77db7a40 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.nearbyint.nxv1f16(, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 9498c65ba9a17..89769615365ca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll index adfae5ede7bb5..5bc1ab9820d6c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zbb -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zbb -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s define i64 @reduce_add(i64 %x, <4 x i64> %v) { ; CHECK-LABEL: reduce_add: diff --git a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll index 393cd5c7f52e2..52e5ecf9cb8a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fptosi-sat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; Float diff --git a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll index a7efa4b3de940..02b43c2d95295 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fptoui-sat.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=CHECK,CHECK64 ; Float diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll index 7fac8949c5517..3fd37384ada9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll index ccfe94ecad286..54f56eadf0034 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll @@ -559,7 +559,7 @@ define @after_fsrm3( %0, @after_fsrm3( %0, undef, %0, %1, - i64 5, i64 %2) + i64 3, i64 %2) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll index aaa7a538e70fb..095f44cfb63e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 
-mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This file tests the code generation for `llvm.experimental.constrained.round.*` on scalable vector type. diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index 193773b0c89c9..fd834e9eb5275 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index cdc01d658778b..051939d988f85 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This file tests the code generation for `llvm.experimental.constrained.roundeven.*` on scalable 
vector type. diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 052ee2d3a43cf..8514658824678 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll index 21615b516da89..d07bc2c6bf74d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @trunc_nxv1f16( %x) strictfp { diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll index b29b24a9ce7b2..2b3c952679eac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll index 5cd9996c7ba3b..4d47c265a9747 100644 --- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zfh,+zvfh,+v -verify-machineinstrs < %s | \ +; RUN: llc -mtriple=riscv32 -mattr=+zvfh,+v -verify-machineinstrs < %s | \ ; RUN: FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+zfh,+zvfh,+v -verify-machineinstrs < %s | \ +; RUN: llc -mtriple=riscv64 -mattr=+zvfh,+v -verify-machineinstrs < %s | \ ; RUN: FileCheck %s ; ================================================================================ diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index 71f90153b2124..e2298774a9b8d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple riscv64 
-mattr=+m,+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s define @insert_nxv8i32_nxv4i32_0( %vec, %subvec) { ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0: diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll index eada90e055df9..e9e1303d10768 100644 --- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s ; Check that we are able to legalize scalable-vector loads that require widening. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-fp.ll index 9c7ad239bcade..c301d4975e713 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-load-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s define @masked_load_nxv1bf16(ptr %a, %mask) nounwind { ; CHECK-LABEL: masked_load_nxv1bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/masked-store-fp.ll index ddb56e0d979a1..586af50266f94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-store-fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < 
%s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s define void @masked_store_nxv1bf16( %val, ptr %a, %mask) nounwind { ; CHECK-LABEL: masked_store_nxv1bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll index 5c14ed1e813c0..420597b009f33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-tama.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vle.mask.nxv1i64( diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll index 4098270d365a9..0e771eb7c4315 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-tamu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vle.mask.nxv1i64( diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll index 4cd7e143be66c..c4c2fc88913bb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-tuma.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vle.mask.nxv1i64( diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll b/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll index c8719e6a2e7c3..ec0ebb10e8f7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/masked-tumu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; 
RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh\ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh\ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vle.mask.nxv1i64( diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 189ba08dddc7a..9f6fc3b5d7d10 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV64 diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 29db67b4b0a41..f75f8dfedc543 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zvfbfmin,+v \ +; 
RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,RV64 diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll index a6c6db345032e..20296c09998b8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-UNKNOWN -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-256 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-512 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-UNKNOWN -; RUN: llc -mtriple=riscv64 
-mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-256 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-512 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-UNKNOWN -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-256 -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-512 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-UNKNOWN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-256 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-512 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-UNKNOWN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-256 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-512 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-UNKNOWN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-256 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-512 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-UNKNOWN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-256 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32-BITS-512 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-UNKNOWN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=256 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-256 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=512 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64-BITS-512 ; ; VECTOR_REVERSE - masks diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index 5aa773b01e692..12c7b9cf13e11 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d 
-verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll index 8bb62eaa8e9e9..dbd4224c7ef08 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll @@ -9,36 +9,54 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: lhu s0, 0(a0) ; CHECK-NEXT: lhu s1, 2(a0) ; CHECK-NEXT: lhu s2, 4(a0) ; CHECK-NEXT: lhu a0, 6(a0) ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 4(sp) -; CHECK-NEXT: fmv.w.x fa0, s2 +; CHECK-NEXT: fmv.w.x fa5, s2 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: fmv.s fa0, fa5 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 12(sp) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: fmv.w.x fa0, s1 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 8(sp) -; CHECK-NEXT: fmv.w.x fa0, s0 +; CHECK-NEXT: fmv.w.x fa5, s0 +; CHECK-NEXT: 
vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: fmv.s fa0, fa5 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 0(sp) -; CHECK-NEXT: addi a0, sp, 4 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: addi a0, sp, 12 -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: addi a0, sp, 8 -; CHECK-NEXT: vle32.v v11, (a0) -; CHECK-NEXT: mv a0, sp -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v9, 1 -; CHECK-NEXT: vslideup.vi v8, v11, 1 +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index a454f9dd97ceb..7b6027991c320 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 
-mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll index a4936483e8a15..6f5c1eab7f07a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll index 9857009002eb9..447962a7542f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d 
-verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll index 11830c924867b..7249069294c46 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 5ba4efa8458c7..3e6f8953a515f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll index 3c19616576f55..fbe1a97c201cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll @@ -13,10 +13,8 @@ declare @llvm.riscv.sf.vfnrclip.x.f.qf.nxv1i8.nxv1f32.iXLen( define @intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32( %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: sf.vfnrclip.x.f.qf v9, v8, fa0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret entry: @@ -24,7 +22,7 @@ entry: undef, %0, float %1, - iXLen 0, iXLen %2) + iXLen 7, iXLen %2) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll index dbcee311c6e35..dfb0ccd982e84 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll @@ -13,10 +13,8 @@ declare @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv1i8.nxv1f32.iXLen( define @intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32( %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fsrmi a1, 0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v9, v8, fa0 -; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret entry: @@ -24,7 +22,7 @@ entry: undef, %0, float %1, - iXLen 0, iXLen %2) + iXLen 7, iXLen %2) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll index 8e422fa6f76b3..2e6df11840179 100644 --- a/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll +++ b/llvm/test/CodeGen/RISCV/rvv/splat-vectors.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s define @splat_c3_nxv4i32( %v) { ; CHECK-LABEL: splat_c3_nxv4i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll index 1491bb6c337a0..b5613a4a63588 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v < %s \ +; RUN: llc -mtriple riscv32 -mattr=+d,+zvfh,+v < %s \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v < %s \ +; RUN: llc -mtriple riscv64 -mattr=+d,+zvfh,+v < %s \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \ +; RUN: llc -mtriple riscv32 -mattr=+d,+zvfh,+v,+unaligned-vector-mem < %s \ ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s -; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \ +; RUN: llc -mtriple riscv64 -mattr=+d,+zvfh,+v,+unaligned-vector-mem < %s \ ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll index 1bfc0f432eb55..6b0abeef657eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; Test that we can remove trivially-undef VP operations of various kinds. diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll index bcc7bb9f072f6..8640b61e64628 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vfmacc.nxv1f32.nxv1f32( diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll index aeb3f6c174859..3052108a12e3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 
-mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vle.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vcompress.ll b/llvm/test/CodeGen/RISCV/rvv/vcompress.ll index b763e116a9f62..5ee82e6d95d4d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcompress.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcompress.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s declare @llvm.riscv.vcompress.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll index e8a7d79075859..ccf82b93d6b75 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcopysign-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll 
b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index eb02fd895f18d..bc32518b67195 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s ; Integers @@ -90,6 +90,64 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { ret {<2 x i64>, <2 x i64>} %retval } +define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i64_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v14, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v15, v14, -4 +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 4 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v16, v15, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v15, v14, 1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v15 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v8, v14, -3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v10, v16, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec) + ret {<4 x i64>, <4 x i64>} %retval +} + +define {<8 x i64>, <8 x i64>} 
@vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i64_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vadd.vv v7, v8, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v16, v7 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, -16 +; CHECK-NEXT: vadd.vi v12, v7, -8 +; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v24, v16, 8 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vadd.vi v20, v7, 1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vadd.vi v16, v7, -7 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v24, v16, v0.t +; CHECK-NEXT: ret + %retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec) + ret {<8 x i64>, <8 x i64>} %retval +} + declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>) declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>) declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>) @@ -176,9 +234,41 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double ret {<2 x double>, <2 x double>} %retval } +define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f64_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v14, v12, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v15, 
v14, -4 +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 4 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v16, v15, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v15, v14, 1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v15 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vi v8, v14, -3 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v10, v16, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +%retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec) +ret {<4 x double>, <4 x double>} %retval +} + declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>) declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>) declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>) declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>) declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>) declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 54373d94f8f5f..2521e4d707b1f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV32 %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV64 %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+m | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 28f7eb4329e3b..499eee1819a4a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s ; Integers @@ -84,6 +84,22 @@ define {, } @vector_deinterleave_nxv2i64_nxv ret {, } %retval } +define {, } 
@vector_deinterleave_nxv4i64_nxv8i64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vadd.vv v24, v16, v16 +; CHECK-NEXT: vrgather.vv v16, v8, v24 +; CHECK-NEXT: vadd.vi v24, v24, 1 +; CHECK-NEXT: vrgather.vv v0, v8, v24 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vmv4r.v v12, v0 +; CHECK-NEXT: ret +%retval = call {, } @llvm.vector.deinterleave2.nxv8i64( %vec) +ret {, } %retval +} + declare {, } @llvm.vector.deinterleave2.nxv32i1() declare {, } @llvm.vector.deinterleave2.nxv32i8() declare {, } @llvm.vector.deinterleave2.nxv16i16() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index 99872c199a1e0..e730ae230d5a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck -check-prefixes=CHECK,RV32 %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck -check-prefixes=CHECK,RV64 %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zfh,+zvfh | FileCheck %s --check-prefix=ZVBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck -check-prefixes=CHECK,RV64 %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zvfh | FileCheck %s --check-prefix=ZVBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zvfh | FileCheck %s --check-prefix=ZVBB ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index a06aa2d02b11b..8fc6bb6e2b7ac 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV32 %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck --check-prefixes=CHECK,RV64 %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck --check-prefixes=CHECK,RV64 %s ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 83c235d8e87ab..362d8a8f372d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zfh,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zfh,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s 
--check-prefix=ZVBB ; Integers diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll index c8313c902697b..1d8638844af7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll index b9be6eb1fa373..42b71d412fde4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll index 53be153f8ff2d..c6c92db62bf69 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll index c3c0958f7096d..53a13b511a799 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll index 4c298ab2b5e6d..00ff3456a8e2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd.ll index ae7d7d5d19627..e5f7545eea6fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfhmin,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfhmin,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s -; RUN: sed 's/iXLen/i32/g' %s | not --crash llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 's/iXLen/i32/g' %s | not --crash llc -mtriple=riscv32 -mattr=+v,+zvfhmin \ ; RUN: -target-abi=ilp32d 2>&1 | FileCheck %s --check-prefixes=ZVFMIN -; RUN: sed 's/iXLen/i64/g' %s | not --crash llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 
's/iXLen/i64/g' %s | not --crash llc -mtriple=riscv64 -mattr=+v,+zvfhmin \ ; RUN: -target-abi=lp64d 2>&1 | FileCheck %s --check-prefixes=ZVFMIN ; ZVFMIN: LLVM ERROR: Cannot select: intrinsic %llvm.riscv.vfadd diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll index c97278480f1a8..862a8355d4321 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfclass-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass-vp.ll index be2d576597da2..36e1bea1f9994 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfclass-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfclass-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @isnan_nxv2f16( %x, %m, i32 zeroext %evl) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfclass.ll b/llvm/test/CodeGen/RISCV/rvv/vfclass.ll index 
7c7cdab19aaea..293300bd8dd69 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfclass.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfclass.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfclass.nxv1i16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll index 1cc9ea029d457..21c5f757e4558 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.fcmp.nxv1f16(, , metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll index 9a10359228e55..56284d90a146b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: 
llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.fcmps.nxv1f16(, , metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll index c1e1450d0b0a2..b28981ff196ad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll index bc8440920cd86..87d7885d44103 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-x.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d 
| FileCheck %s declare @llvm.riscv.vfcvt.f.x.v.nxv1f16.nxv1i16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll index 9cf47f993ee46..1557e33dd7737 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-f-xu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfcvt.f.xu.v.nxv1f16.nxv1i16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-x-f.ll index 1caddaf3feeca..3b641ea5bdf45 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfcvt.rtz.x.f.v.nxv1i16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-xu-f.ll index 42e55a5f170e2..2fdb4b13acc98 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-rtz-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfcvt.rtz.xu.f.v.nxv1i16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll index 582c302dd2a15..2ea0f668dc211 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfcvt.x.f.v.nxv1i16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll index 708b38b8ed116..a7a742d12dc67 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfcvt-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 
-mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfcvt.xu.f.v.nxv1i16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index aa59732e1e1e5..ab517de846b0f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index f7db2be35d720..8d0c3bcf16756 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll index 0fe6c5dec4264..cdc0dc0dbca3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll index 3f67c433bcbf1..03094db580596 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfdiv.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll index ab8a595dde5d7..35bed86d61176 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fma.nxv1f64(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index f0c74d064016a..a65c2fba81acb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmacc-vp.ll index 54855e6152b95..ef583b748b9c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmacc-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m 
-target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fma.nxv1f16(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll index 5586b52b64ec4..1f0db104df7aa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmacc.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll index dea411348ce54..50bf6eb78044f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc 
-mtriple=riscv32 \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll index 2df2212c43db0..1de8ce51bfe32 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll index c44690d23f082..fb04888a84dea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmadd.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll index b5604add6d25b..8ff7453a0e9a7 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll index 6e38881b4d60f..345a05d98f77b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax.ll index 458815c98b258..52067e00a54bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; 
RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmax.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll index e47c2a47d6c64..e227cff7054fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmerge.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll index 9212ddab5b1eb..de49aed6e52b2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: 
-target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll index f1d6b2100ae98..6f153acda01e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin.ll index 842c78dce02f8..a69bb9e3d6c02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmin.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsac-vp.ll index 
f1d5562131b8e..31369b69bee15 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsac-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fma.nxv1f16(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll index 4eac7b63fd881..319c945435402 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmsac.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll index 7ec241bf74247..8bd82336de562 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-sdnode.ll index 433b0d1cbdd85..1f99d0e3a5b4b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs -early-live-intervals < %s | FileCheck %s ; This tests a mix of vfmsac and vfmsub by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll index 626b40e132c73..23b4479fa8c94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmsub.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll index 999b06ba5a579..9a68da58096d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll index 2ab04a45c8183..bbacbaa8e5e49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | 
FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll index 3114fb5d3bfa3..7112cf3b76835 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul.ll index b73d03fe36c73..03084ebc3ae30 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs 
-target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmul.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll index abda6750e5a8a..ceefe709fe2a3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fmuladd.nxv1f16(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll index af1c378c56812..3779b0ab18d8a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+zfh,+zvfh -target-abi lp64d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+zfh,+zvfh -target-abi ilp32d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+zvfh -target-abi lp64d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+zvfh -target-abi ilp32d -verify-machineinstrs < %s | FileCheck %s declare half @llvm.riscv.vfmv.f.s.nxv1f16() diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll index 1e863a4adbc21..912dfe499016f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 
's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefixes=CHECK,RV64 declare @llvm.riscv.vfmv.s.f.nxv1f16(, half, iXLen) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll index 237ef11d154ba..a3d3443e48c6d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmv.v.f.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll index 183ffa8a668a0..83250a0f90eaf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-f.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; 
RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.f.f.w.nxv1f16.nxv1f32( , diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll index aef119faf5f7f..81b684978bafc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-x.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.f.x.w.nxv1f16.nxv1i32( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll index bc287e4bdef16..697c062c7a71f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-f-xu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 
's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.f.xu.w.nxv1f16.nxv1i32( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-f-f.ll index f5a019d3152dd..c0e5c6991aec2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-f-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rod-f-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.rod.f.f.w.nxv1f16.nxv1f32( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-f.ll index 65373bfbdb44c..4079e1c055c5e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-f.ll index cafffa0d51f56..9ce9fbfa8f19c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-rtz-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll index 334d5eba03001..5831bb33ff90e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll index bea99a0e81a34..3ef0a95197198 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvt-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs 
-target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll index b4ec691796a71..9f456e97be11d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll index af2668a9b0c54..6fa6c26890c3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+v 
-target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmacc-vp.ll index ee3ed603ff6d7..3b5cbb685a424 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmacc-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fma.nxv1f16(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll index 01f4715274b6c..31df27853cb3c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfnmacc.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll index 5ec089a2dcac8..332ab89b25856 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-sdnode.ll index 61acb88b17bd7..07c85bc67339b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfnmacc and vfnmadd by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll index ae4cfef35e61d..6f41ed177beac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | 
FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfnmadd.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsac-vp.ll index 14dba24daf5ff..edeb554bc6d35 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsac-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fma.nxv1f16(, , , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll index 071f546b4f609..50497d92764a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfnmsac.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll index 286492bce2960..8b968017841f8 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfhmin,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-sdnode.ll index 72f64b23f7584..a356da80e1639 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s ; This tests a mix of vfnmsac and vfnmsub by using different operand orders to diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll index 4922cf40e5038..c5d5bb1fe0b3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc 
-mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfnmsub.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll index 2375f5def3dab..3999b97d6e256 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.fpext.nxv1f32.nxv1f16(, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll index 2f8b1d501f003..3b2de0185f90c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 
-mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @vfpext_nxv1f16_nxv1f32( %va) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll index 9d10b0209cbe7..5962d38b1baa7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fpext.nxv2f32.nxv2f16(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll index 47f68837cd578..6ebdcec4a403c 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.fptosi.nxv1i1.nxv1f16(, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll index 4edaa3825e587..37e14783d1873 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll index 2e9ceadb96592..b7f2133144e71 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vfptosi_nxv2i1_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_nxv2i1_nxv2bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll index f42b603509c22..a8ea062211337 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s 
--check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vfptosi_v4i7_v4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptosi_v4i7_v4bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll index 2cf158ddbd50d..8ac5992bd5eb9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vfptoui_nxv2i1_nxv2bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vfptoui_nxv2i1_nxv2bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll index 403bc595b9bbd..9062d8a6f2024 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vfptoui_v4i7_v4bf16( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfptoui_v4i7_v4bf16: diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll index 65e753445aceb..566920d577ce1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-constrained-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 
-mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.fptrunc.nxv1f32.nxv1f64(, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll index e930e1fe42f0e..dcec2200b1308 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @vfptrunc_nxv1f32_nxv1f16( %va) { diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index da16feeddecd7..16c8fa728500e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+m,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fptrunc.nxv2f16.nxv2f32(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfrdiv-vp.ll index 876f8d9456386..e8688abc63a5d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrdiv-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fdiv.nxv1f16(, , , i32) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll b/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll index f73e7dce92120..9d29db4b1868e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrdiv.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfrdiv.nxv1f16.f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll b/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll index 914b3b33fbe5e..98d82144a3334 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrec7.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfrec7.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredmax.ll b/llvm/test/CodeGen/RISCV/rvv/vfredmax.ll index 4219abbbaa1d8..f1ed95512741c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfredmax.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfredmax.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 
-mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfredmax.nxv4f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredmin.ll b/llvm/test/CodeGen/RISCV/rvv/vfredmin.ll index 9fcd233fdc142..5dfa5a1f2b20e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfredmin.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfredmin.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfredmin.nxv4f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll b/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll index bb489e0f380ca..a85850b0c4504 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfredosum.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfredosum.nxv4f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll 
b/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll index c1463102c8e68..b3101450493e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfredusum.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfredusum.nxv4f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7.ll index f13fae2614eb6..97d6e2924178f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrsqrt7.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfrsqrt7.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsub-vp.ll index bd941dc1a7772..e2864ea30ec7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrsub-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; 
RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.fsub.nxv1f16(, , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfrsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfrsub.ll index 1104753419883..c3406c2730516 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfrsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfrsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfrsub.nxv1f16.f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnj.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnj.ll index e7f0b7ab8a892..73aaf32471db8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsgnj.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnj.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfsgnj.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnjn.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnjn.ll index 2ac48e0b9f9dc..06dc5656bb6c8 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfsgnjn.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnjn.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfsgnjn.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsgnjx.ll b/llvm/test/CodeGen/RISCV/rvv/vfsgnjx.ll index b9bbd8982d743..891cda277a444 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsgnjx.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsgnjx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfsgnjx.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/vfslide1down.ll index 9317a8a21f494..dd036d1e1724d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfslide1down.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfslide1down.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 
's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfslide1down.nxv1f16.f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/vfslide1up.ll index c71cc13566f6d..6eead91c17873 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfslide1up.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfslide1up.nxv1f16.f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll index 9da1e0a576d5b..d92db0b5a3a7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 
-mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll index de31a02cd1545..a51b0e4efecf2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll index 574c2e0526301..00542284ebaee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll index 
3e3eea9f353c8..500a07ad87edf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfsqrt.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll index e40427a305f61..9b24b1df0f064 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll index e56cfd9ee4eb1..a2137eaa7a958 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc 
-mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll index dd57b65b50f4f..02647c1927c25 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zfbfmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub.ll index 04590a5223665..96c915c6dbf1e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | 
FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfsub.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll index d7f5b109aa7cb..68014ff4206f8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @vfwadd_vv_nxv1f64( %va, %vb) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll index b42a1fe46e670..d980803cb3897 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwadd.nxv1f32.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll index 76246eba9480d..8eb2a2c0391b5 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd.w.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwadd.w.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll index 89c5d5a9a68f1..4f03188cf3806 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-f.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1f16( , diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll index 
cc8eeaaba256d..3c1e07b4a5ef4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-x.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.f.x.v.nxv1f16.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll index 841278924d0f6..17ea8f50a6943 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-f-xu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.f.xu.v.nxv1f16.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll index f3a73e4fa3639..f9f426cd3c9b6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh 
\ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.rtz.x.f.v.nxv1i32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll index d1d70aeee45e8..61d2ad5bf892a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-rtz-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.rtz.xu.f.v.nxv1i32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll index 9a80e02bbbbb4..8b545585c56d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-x-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.x.f.v.nxv1i32.nxv1f16( diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll index 98caaf91ab3c0..476e2398c479d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvt-xu-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwcvt.xu.f.v.nxv1i32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll index 225ba1c14031f..354f169561735 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwmacc.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll index 5e3f63b95b2f9..bd0d616fa6176 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 
's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwmsac.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll index 6b16171721f1d..f00ff4b6d2cec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @vfwmul_vv_nxv1f64( %va, %vb) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll index bc5759f469ad0..dae29efc75bf8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwmul.nxv1f32.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll 
b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll index fc8e15273f085..e1db8cb722760 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll index b51faf9082c8f..5c62112aa9e3d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll b/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll index dbf7e27d318e5..bbb019f2f5892 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwredosum.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 
-mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwredosum.nxv2f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll b/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll index 9710051186c8d..05044ef689a92 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwredusum.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwredusum.nxv2f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll index 787de48be7f0d..b9f66d5d30825 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s define @vfwsub_vv_nxv1f64( %va, %vb) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll 
index 0e3e5f8aabfd3..4f263c63e545b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwsub.nxv1f32.nxv1f16.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll index 90f92226dcdd9..fdb48a6f10d3a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub.w.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfwsub.w.nxv1f32.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll index 90e5f58a603a5..f25a27ca2b905 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-constrained-sdnode.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 
-mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.experimental.constrained.sitofp.nxv1f16.nxv1i1(, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll index f5f8ee91c31c4..0f76968485fb4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+zvfbfmin \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zvfbfmin \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v,+zvfbfmin \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v,+zvfbfmin \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=CHECK,ZVFHMIN diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll index 481505a2095cb..4f7286aeeda1e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc 
-mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll index c308512753f2d..7b1d545ff9e9e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll index b96874fe90982..e6a98c90037d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll index d6cbf362e7ece..809ae2d2bebfe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32.ll index 0e4915895ef34..b89097b8ff974 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare {target("riscv.vector.tuple", , 2), i32} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64.ll index 632fbc1e4431d..68acb3beb0686 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare {target("riscv.vector.tuple", , 2), i64} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv32.ll index 4b475dd96e00e..a87d51692227f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i32, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv64.ll index 6cc95979eb13e..7b5421fba3dcc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vlsseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll index 10bfdec0e2c99..d1ca40bcc0db3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll index 28f70ce08bfe1..3b9db2655e033 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll @@ 
-1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll index 2e5b67c93fce1..9ca78c872befd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmfeq.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll index b5ca47707c8a8..7cf18a7015812 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmfge.nxv1f16( diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll index 971249d38d1b2..b78f2da4ae254 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfgt.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmfgt.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll index f19a181a365af..940e4d043f63f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfle.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfle.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmfle.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll index 0a04642219334..10ddfb8f014ed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmflt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmflt.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: 
-verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmflt.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll index 520099247e0f3..4d8a95de1d3de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfne.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfne.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vmfne.nxv1f16( diff --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll index 84c8321b5b934..34f8f35ee98c0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare @llvm.vp.gather.nxv1i8.nxv1p0(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll index 0a98b672fb19c..5683a7b758854 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.load.nxv1i8.p0(ptr, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll index 0028f3035c273..329f97da64ea8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv64 
-mattr=+d,+zvfh,+zvfbfmin,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32 -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v,+m \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v,+m \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64 declare void @llvm.vp.scatter.nxv1i8.nxv1p0(, , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll index d935e52149d20..7168b07e81971 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+zvfbfmin,+v \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.vp.store.nxv1i8.p0(, ptr, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll index f3ccf74019bb5..13d1ac5088479 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare half @llvm.vp.reduce.fadd.nxv1f16(half, , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll index d1e947e2f3367..f386fd9cd3aeb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \ ; RUN: -verify-machineinstrs | FileCheck %s declare @llvm.riscv.vrgatherei16.vv.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index ee0617c931480..b7e2c92350a2f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+zfh,+zvfh,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+zvfh,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+zfh,+zvfh,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+zvfh,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc 
-mtriple=riscv32 -mattr=+d,+m,+zfh,+zvfhmin,+v -target-abi=ilp32d \ +; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+zvfhmin,+v -target-abi=ilp32d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+zfh,+zvfhmin,+v -target-abi=lp64d \ +; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+zvfhmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare @llvm.vp.select.nxv1i1(, , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll index d1c2cf325bec4..ec16e58f6e57d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s define @vsitofp_nxv2bf16_nxv2i1( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vsitofp_nxv2bf16_nxv2i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll index d163988b3d41c..8b6e437fbc0a3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN 
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vsitofp_nxv2bf16_nxv2i7( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vsitofp_nxv2bf16_nxv2i7: diff --git a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll index fc26ac25fe081..2be187c50af26 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \ ; RUN: -verify-machineinstrs | FileCheck %s declare @llvm.riscv.vslidedown.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll index 4880bf2bc66dd..1e3ede7fee9cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \ ; RUN: 
-verify-machineinstrs | FileCheck %s declare @llvm.riscv.vslideup.nxv1i8( diff --git a/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv32.ll index c24895a0e6380..6b54ce4974f34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv64.ll index c8e7c43754058..70fb9c2b348d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsoxseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsseg-rv32.ll index 330ec59d3459c..7b80d45a924d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), 
ptr, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsseg-rv64.ll index 877eeeaf10039..6ce326be23ee3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vssseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssseg-rv32.ll index df443d6f4d93a..a0a583c046c49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vssseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vssseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vssseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i32, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vssseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssseg-rv64.ll index dd6faad09f49a..bdd809841d2d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vssseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vssseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vssseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv32.ll 
index 9119d42ba0aee..316c7ccb7e415 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv32.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv64.ll index 82698e6da2abf..22be2ebca8fde 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsuxseg-rv64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh,+zvfbfmin \ +; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs < %s | FileCheck %s declare void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll index 5426102efc73e..616dc697b2847 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s define @vuitofp_nxv2bf16_nxv2i1( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vuitofp_nxv2bf16_nxv2i1: diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll index 7c96a9e9e10f6..499bd4aa667c3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define @vuitofp_nxv2bf16_nxv2i7( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vuitofp_nxv2bf16_nxv2i7: diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index 0ee067b673da9..b887036372f7b 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -329,6 +329,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: mv s0, a0 ; RV32-NEXT: lbu a0, 4(a0) ; RV32-NEXT: lw a1, 0(s0) @@ -351,6 +352,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; 
RV32-NEXT: addi a0, a0, -1638 ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: sltiu s1, a0, 2 +; RV32-NEXT: xori s4, s1, 1 ; RV32-NEXT: li a1, 1463 ; RV32-NEXT: mv a0, s2 ; RV32-NEXT: call __mulsi3 @@ -358,23 +360,22 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: sltiu a0, a0, 293 ; RV32-NEXT: addi s3, s3, -1 -; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: addi s1, s1, -1 -; RV32-NEXT: slli a1, s1, 21 -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: andi a2, s3, 2047 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a1, s3, 2047 ; RV32-NEXT: andi a0, a0, 2047 ; RV32-NEXT: slli a0, a0, 11 ; RV32-NEXT: slli s1, s1, 22 ; RV32-NEXT: or a0, a0, s1 -; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: sw a0, 0(s0) -; RV32-NEXT: sb a1, 4(s0) +; RV32-NEXT: sb s4, 4(s0) ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; @@ -457,16 +458,15 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32M-NEXT: addi a1, a1, -1638 ; RV32M-NEXT: andi a1, a1, 2047 ; RV32M-NEXT: sltiu a1, a1, 2 -; RV32M-NEXT: li a4, 1463 -; RV32M-NEXT: mul a3, a3, a4 +; RV32M-NEXT: xori a4, a1, 1 +; RV32M-NEXT: li a5, 1463 +; RV32M-NEXT: mul a3, a3, a5 ; RV32M-NEXT: addi a3, a3, -1463 ; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: sltiu a3, a3, 293 ; RV32M-NEXT: addi a2, a2, -1 -; RV32M-NEXT: addi a3, a3, -1 ; RV32M-NEXT: addi a1, a1, -1 -; RV32M-NEXT: slli a4, a1, 21 -; RV32M-NEXT: srli a4, a4, 31 +; RV32M-NEXT: addi a3, a3, -1 ; RV32M-NEXT: andi a2, a2, 2047 ; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: slli a3, a3, 11 diff --git a/llvm/test/CodeGen/SPIRV/ShaderBufferImage.ll b/llvm/test/CodeGen/SPIRV/ShaderBufferImage.ll index 1f203043e6a16..760621c034086 100644 --- 
a/llvm/test/CodeGen/SPIRV/ShaderBufferImage.ll +++ b/llvm/test/CodeGen/SPIRV/ShaderBufferImage.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-vulkan-library %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %} ; CHECK-NOT: OpCapability ImageBasic diff --git a/llvm/test/CodeGen/SPIRV/ShaderImage.ll b/llvm/test/CodeGen/SPIRV/ShaderImage.ll index 9cd5fb3380805..e6235fc763257 100644 --- a/llvm/test/CodeGen/SPIRV/ShaderImage.ll +++ b/llvm/test/CodeGen/SPIRV/ShaderImage.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-vulkan-library %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: [[Float:%[0-9]+]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/basic_int_types.ll b/llvm/test/CodeGen/SPIRV/basic_int_types.ll index e85e0919d1793..5aa7aaf6fbd01 100644 --- a/llvm/test/CodeGen/SPIRV/basic_int_types.ll +++ b/llvm/test/CodeGen/SPIRV/basic_int_types.ll @@ -1,4 +1,3 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} diff --git a/llvm/test/CodeGen/SPIRV/basic_int_types_spirvdis.ll b/llvm/test/CodeGen/SPIRV/basic_int_types_spirvdis.ll index 0d7b2b99f64ea..56b5f48715533 100644 --- a/llvm/test/CodeGen/SPIRV/basic_int_types_spirvdis.ll +++ b/llvm/test/CodeGen/SPIRV/basic_int_types_spirvdis.ll @@ -1,4 +1,3 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error 
"bitcast must change type" ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll index 2d8692adf12a2..7243977c68a02 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/SV_DispatchThreadID.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} ; This file generated from the following command: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll index d0a56854c32f8..67e8847a2945f 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveGetLaneIndex.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} ; This file generated from the following command: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll index c3e894afd710b..3b4ff4f293c64 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix 
G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll index 1936f6d272073..9726bfbf1ada3 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/acos.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll index 1edd69e2b0d5b..0980a28f4e8ee 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll @@ -1,6 +1,5 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-HLSL -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OCL +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-HLSL +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OCL ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure spirv operation function calls for all are generated. diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll index dc6e9dc203305..5d805202be5a7 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll @@ -1,6 +1,5 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-HLSL -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OCL +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-HLSL +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OCL ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure spirv operation function calls for any are generated. 
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll index be338f22bf125..73b29b6a264be 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/asin.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll index 5d352eb80af2d..f0ee310d3020c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll index aba6f7583b683..922e017b0d5a0 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; 
CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll index 2c36459bdac95..29ac39ef9b418 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/ceil.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll index 937a545cc563c..09f1cf07267fe 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cos.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll index 2d7a4caada7d5..bc6dd75bce94c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cosh.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} 
; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll index d47ec3ec27aa1..4fe0a6fff50d9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/countbits.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpMemoryModel Logical GLSL450 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll index 6915362001288..5945bc390744a 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll index 43bb8e217a670..949f750db2dc9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll index ae6c33cb0c7ef..9e24c46cfcdef 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/exp2.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll index 1ecaafc22e6fa..1f79cc951656f 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/floor.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll index add94601bd168..0bb969021de3d 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll index b202025f5dc83..073a5c4f8b8ac 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmax.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll index 77e2ed1748e6e..d4bd1a6044e53 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmin.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll index 41c18b693574f..d660abc1b6232 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/frac.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 
-mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/group_memory_barrier_with_group_sync.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/group_memory_barrier_with_group_sync.ll index 6955411a0e4e9..10310cd30088c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/group_memory_barrier_with_group_sync.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/group_memory_barrier_with_group_sync.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpMemoryModel Logical GLSL450 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll index a161147c8b964..f68a77188c22d 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#int_16:]] = OpTypeInt 16 0 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll index 94272a84bd639..1d7f3a6ccc535 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure SPIRV operation function calls for lerp are generated as FMix diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll index d5dd92042537a..20339b98411a0 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll index a829422d84ebf..10264d3ef11d3 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log10.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#extinst:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll index c71ca125c172a..7bbd4596e99f7 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/log2.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; 
RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll index ddf89221be2ae..71f69438a5ae9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure SPIRV operation function calls for normalize are lowered correctly. diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll index 38c51ca47d86c..9454143913116 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/pow.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/radians.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/radians.ll index 7aad4df76e318..26e90bd7ef708 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/radians.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/radians.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll index 9c8c14c2a7220..056673fa9d5a5 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" - ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s + ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; CHECK-DAG: %[[#float_64:]] = OpTypeFloat 64 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll index ce8175fdceb20..9dc43cb25f7d6 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/reversebits.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpMemoryModel Logical GLSL450 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll index 0c88c55cbd395..58e54b0fbf26d 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/round.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 
-mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll index 33d3edc080fd7..cd9374bb180f2 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rsqrt.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll index 7474b75994513..19aec458172f4 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sin.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll index 6a31b70218773..99da3ac81b434 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sinh.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: 
llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll index cbf0b243ab2b3..c6f17b677b47e 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smax.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll index 960de853f3afd..3c7befe1078b8 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/smin.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll index a05a31c18a754..519b36f0cd3e5 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/splitdouble.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error 
"bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure lowering is correctly generating spirv code. diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll index 55d8a286a0e7f..3230f56511ebe 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/sqrt.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll index eac0b85895554..f833c3256a815 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; Make sure SPIRV operation function calls for step are lowered correctly. 
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll index 6e2f0698b7b6d..f0969667f98f9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tan.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll index 1dfdf83fee31e..712c10d019b55 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/tanh.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll index bae614ee59676..b0f2a32d001e8 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/trunc.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: 
OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll index e2b14b089bc13..3b3dbc997817c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umax.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll index 708b76a93e661..1a3bcdbbcc31b 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/umin.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpExtInstImport "GLSL.std.450" diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll new file mode 100644 index 0000000000000..7a21a6c4bf7ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageDynIdx.ll @@ -0,0 +1,40 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability SampledImageArrayDynamicIndexing +; CHECK-NEXT: OpCapability Sampled1D +; 
CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[CombindedType:%[0-9]+]] = OpTypeSampledImage [[BufferType]] +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[CombindedType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[CombindedType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[CombindedType]] [[ac]] + %buffer0 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[CombindedType]] [[ac]] + %buffer1 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll new file mode 100644 index 
0000000000000..b821f5bdfa137 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/CombinedSamplerImageNonUniformIdx.ll @@ -0,0 +1,47 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK: OpCapability ShaderNonUniform +; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing +; CHECK-NEXT: OpCapability Sampled1D +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[CombindedType:%[0-9]+]] = OpTypeSampledImage [[BufferType]] +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[CombindedType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[CombindedType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0:%[0-9]+]] = OpLoad [[CombindedType]] [[ac0]] + %buffer0 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) + 
@llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[CombindedType]] [[ac1]] + %buffer1 = call target("spirv.SampledImage", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/HlslBufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/HlslBufferLoad.ll similarity index 95% rename from llvm/test/CodeGen/SPIRV/HlslBufferLoad.ll rename to llvm/test/CodeGen/SPIRV/hlsl-resources/HlslBufferLoad.ll index 66d5f0f4b05fe..b264227771c33 100644 --- a/llvm/test/CodeGen/SPIRV/HlslBufferLoad.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/HlslBufferLoad.ll @@ -1,5 +1,4 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" -; RUN: llc -O0 -mtriple=spirv-vulkan-library %s -o - | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll new file mode 100644 index 0000000000000..c925be1f8216a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageDynIdx.ll @@ -0,0 +1,39 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability 
InputAttachmentArrayDynamicIndexing +; CHECK-NEXT: OpCapability InputAttachment +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] SubpassData 2 0 0 2 Unknown {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_6_2_0_0_2_0( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_6_2_0_0_2_0( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll new file mode 100644 index 0000000000000..bb2e7549fd3ba ---
/dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/InputAttachmentImageNonUniformIdx.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability ShaderNonUniformEXT +; CHECK-NEXT: OpCapability InputAttachmentArrayNonUniformIndexing +; CHECK-NEXT: OpCapability InputAttachment +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] SubpassData 2 0 0 2 Unknown {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] + %buffer0 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_6_2_0_0_2_0( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain
[[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] + %buffer1 = call target("spirv.Image", i32, 6, 2, 0, 0, 2, 0) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_6_2_0_0_2_0( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll new file mode 100644 index 0000000000000..69b1ac9078ff6 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageDynIdx.ll @@ -0,0 +1,65 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability SampledImageArrayDynamicIndexing +; CHECK-NEXT: OpCapability Sampled1D +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK-DAG: OpDecorate [[OtherVar:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[OtherVar]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK-DAG: [[OtherArraySize:%[0-9]+]] = OpConstant [[int]] 5 
+; CHECK-DAG: [[OtherBufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[OtherArraySize]] +; CHECK-DAG: [[OtherArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[OtherBufferArrayType]] +; CHECK-DAG: [[OtherVar]] = OpVariable [[OtherArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @DifferentArraySizesAreDifferentVariables() #0 { +; Make sure we use different variables when the array sizes are different +; same in case one function calls the other. 
+; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[OtherVar]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 5, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll new file mode 100644 index 0000000000000..7d1865aca6735 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/SampledImageNonUniformIdx.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability ShaderNonUniformEXT +; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing +; CHECK-NEXT: OpCapability Sampled1D +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: 
[[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] + %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] + %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll new file mode 100644 index 0000000000000..3ca6788f0e48a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayDynIdx.ll @@ -0,0 +1,38 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability 
SampledImageArrayDynamicIndexing +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[SamplerType:%[0-9]+]] = OpTypeSampler +; CHECK-DAG: [[SamplerPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[SamplerArrayType:%[0-9]+]] = OpTypeArray [[SamplerType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[SamplerType]] [[ac]] + %buffer0 = call target("spirv.Sampler") + @llvm.spv.handle.fromBinding.tspirv.Image( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[SamplerType]] [[ac]] + %buffer1 = call target("spirv.Sampler") + @llvm.spv.handle.fromBinding.tspirv.Image( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll new file mode 100644 index 0000000000000..0917d4751f459 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/SamplerArrayNonUniformIdx.ll @@ -0,0 +1,45 @@ +; RUN: llc -O0 -verify-machineinstrs 
-mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability ShaderNonUniform +; CHECK-NEXT: OpCapability SampledImageArrayNonUniformIndexing +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[SamplerType:%[0-9]+]] = OpTypeSampler +; CHECK-DAG: [[SamplerPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[SamplerArrayType:%[0-9]+]] = OpTypeArray [[SamplerType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[SamplerArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[SamplerType]] [[ac0]] + %buffer0 = call target("spirv.Sampler") + @llvm.spv.handle.fromBinding.tspirv.Image( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[SamplerPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[SamplerType]] [[ac1]] + %buffer1 = call target("spirv.Sampler") + @llvm.spv.handle.fromBinding.tspirv.Image( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse
"frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll new file mode 100644 index 0000000000000..1922e66388324 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll @@ -0,0 +1,39 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability StorageImageArrayDynamicIndexing +; CHECK-NEXT: OpCapability Image1D +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 2 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] 
[[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll new file mode 100644 index 0000000000000..231e1cf7567a0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK: OpCapability ShaderNonUniformEXT +; CHECK-NEXT: OpCapability StorageImageArrayNonUniformIndexing +; CHECK-NEXT: OpCapability Image1D +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] 1D 2 0 0 2 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; 
CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] + %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] + %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_0_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll new file mode 100644 index 0000000000000..454ba1f47db0a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferDynIdx.ll @@ -0,0 +1,39 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability ImageBuffer +; CHECK-NEXT: OpCapability StorageTexelBufferArrayDynamicIndexing +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 R32i {{$}} +; CHECK-DAG:
[[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll new file mode 100644 index 0000000000000..a579aaa1eed69 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageTexelBufferNonUniformIdx.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability
ImageBuffer +; CHECK-NEXT: OpCapability ShaderNonUniformEXT +; CHECK-NEXT: OpCapability StorageTexelBufferArrayNonUniformIndexingEXT +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 2 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] + %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] + %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_2_24( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" 
"hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll new file mode 100644 index 0000000000000..98c4ff7a965d5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferDynIdx.ll @@ -0,0 +1,39 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; SCHECK-NEXT: OpCapability SampledBuffer +; CHECK-NEXT: OpCapability UniformTexelBufferArrayDynamicIndexing +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray [[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 false) + +; CHECK: [[ac:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; 
CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]] + %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 false) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll new file mode 100644 index 0000000000000..da523f215046b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniformTexelBufferNonUniformIdx.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Shader +; CHECK-NEXT: OpCapability SampledBuffer +; CHECK-NEXT: OpCapability ShaderNonUniformEXT +; CHECK-NEXT: OpCapability UniformTexelBufferArrayNonUniformIndexing +; CHECK-NOT: OpCapability + +; CHECK-DAG: OpDecorate [[Var:%[0-9]+]] DescriptorSet 3 +; CHECK-DAG: OpDecorate [[Var]] Binding 4 +; CHECK: OpDecorate [[Zero:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld0:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[One:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ac1:%[0-9]+]] NonUniform +; CHECK: OpDecorate [[ld1:%[0-9]+]] NonUniform + +; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[BufferType:%[0-9]+]] = OpTypeImage [[int]] Buffer 2 0 0 1 R32i {{$}} +; CHECK-DAG: [[BufferPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferType]] +; CHECK-DAG: [[ArraySize:%[0-9]+]] = OpConstant [[int]] 3 +; CHECK-DAG: [[One]] = OpConstant [[int]] 1 +; CHECK-DAG: [[Zero]] = OpConstant [[int]] 0 +; CHECK-DAG: [[BufferArrayType:%[0-9]+]] = OpTypeArray
[[BufferType]] [[ArraySize]] +; CHECK-DAG: [[ArrayPtrType:%[0-9]+]] = OpTypePointer UniformConstant [[BufferArrayType]] +; CHECK-DAG: [[Var]] = OpVariable [[ArrayPtrType]] UniformConstant + +; CHECK: {{%[0-9]+}} = OpFunction {{%[0-9]+}} DontInline {{%[0-9]+}} +; CHECK-NEXT: OpLabel +define void @main() #0 { +; CHECK: [[ac0]] = OpAccessChain [[BufferPtrType]] [[Var]] [[Zero]] +; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]] + %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 0, i1 true) + +; CHECK: [[ac1:%[0-9]+]] = OpAccessChain [[BufferPtrType]] [[Var]] [[One]] +; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]] + %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 1, 24) + @llvm.spv.handle.fromBinding.tspirv.Image_f32_5_2_0_0_1_24( + i32 3, i32 4, i32 3, i32 1, i1 true) + ret void +} + +attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } diff --git a/llvm/test/CodeGen/SPIRV/literals.ll b/llvm/test/CodeGen/SPIRV/literals.ll index 86a366976a6e2..4f5aa7d4fa0e8 100644 --- a/llvm/test/CodeGen/SPIRV/literals.ll +++ b/llvm/test/CodeGen/SPIRV/literals.ll @@ -1,4 +1,3 @@ -; TODO(pull/110270): verifier, fix G_BITCAST error "bitcast must change type" ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} diff --git a/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types-rev.ll index 6fa3f4e53cc59..8d14c3a359963 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types-rev.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types-rev.ll @@ -1,7 +1,4 @@ -; The goal of the test case is to ensure that OpPhi is consistent 
with respect to operand types. -; -verify-machineinstrs is not available due to mutually exclusive requirements for G_BITCAST and G_PHI. - -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#Char:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types.ll b/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types.ll index 4fbaae2556730..07824d4ed6cd8 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/phi-valid-operand-types.ll @@ -1,7 +1,4 @@ -; The goal of the test case is to ensure that OpPhi is consistent with respect to operand types. -; -verify-machineinstrs is not available due to mutually exclusive requirements for G_BITCAST and G_PHI. - -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#Char:]] = OpTypeInt 8 0 diff --git a/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll new file mode 100644 index 0000000000000..f5d3f6ec9ec29 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-fp8 | FileCheck %s + +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK: tdpbf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdpbhf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdphbf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdphf8ps %tmm3, %tmm2, %tmm1 + +define void @test_amx(){ +call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3) +call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3) +call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3) 
+call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3) +ret void +} +declare void @llvm.x86.tdpbf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdpbhf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdphbf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdphf8ps(i8 %tile0, i8 %tile1, i8 %tile2) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll new file mode 100644 index 0000000000000..4f41410010302 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s + +@buf = dso_local global [2048 x i8] zeroinitializer, align 16 +@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + +define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { +; CHECK-LABEL: test_tile_2rpntlvwz0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-NEXT: movw %si, %cx +; CHECK-NEXT: movw %di, %ax +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # 
implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $cl +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: # implicit-def: $al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $buf, %esi +; CHECK-NEXT: movl $32, %edi +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 +; CHECK-NEXT: movabsq $64, %rbx +; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: movabsq $64, %rbx +; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: movl 
$64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) +; CHECK-NEXT: movl $64, %edi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 +; CHECK-NEXT: movl $buf2, %edx +; CHECK-NEXT: movl $32, %esi +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) +; CHECK-NEXT: leaq -8(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 + %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 + %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 + %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 + %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 + %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 + %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 + %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 + %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 + ret void +} + +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 + +declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, 
i16, x86_amx, x86_amx, x86_amx) #3 + +declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 + +attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } +attributes #1 = { argmemonly nofree nounwind readonly } +attributes #2 = { nofree nosync nounwind readnone } +attributes #3 = { nounwind } +attributes #4 = { argmemonly nounwind writeonly } + +!llvm.module.flags = !{!0, !1, !2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"uwtable", i32 2} +!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir new file mode 100644 index 0000000000000..dc79134321e9c --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir @@ -0,0 +1,165 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: [] +liveins: + - { reg: '$edi', virtual-reg: '' } + - { reg: '$esi', virtual-reg: '' } + - { reg: '$edx', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + 
hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', 
debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $rdi, $rsi, $rdx, $rax + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $rdi, $rsi, $rdx, $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 + ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) + ; CHECK-NEXT: renamable $cx = MOV16ri 64 + ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) + ; CHECK-NEXT: renamable $cx = MOV16ri 16 + ; CHECK-NEXT: renamable $r8w = MOV16ri 16 + ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, 
$noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) + ; CHECK-NEXT: $al = IMPLICIT_DEF + ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) + ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) + ; CHECK-NEXT: renamable $r9 = COPY $rsi + ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) + ; CHECK-NEXT: renamable $r8 = COPY $rdi + ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) + ; CHECK-NEXT: renamable $r10 = COPY $rax + ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) + ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 + ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable 
$di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx + ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg + ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $zmm0 = AVX512_512_SET0 + VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) + MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + renamable $rcx = MOV32ri64 64 + MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) + renamable $cx = MOV16ri 64 + MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) + renamable $cx = MOV16ri 16 + renamable $r8w = MOV16ri 16 + MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) + PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) + renamable $r9 = COPY $rsi + $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) + renamable $r8 = COPY $rdi + $di = MOV16rm %stack.6, 1, 
$noreg, 0, $noreg :: (load (s16) from %stack.6) + renamable $r10 = COPY $rax + $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) + renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg + renamable $tmm0 = COPY renamable $tmm5 + renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 + PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx + PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 + renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg + renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg + renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg + renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 + PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 +... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir new file mode 100644 index 0000000000000..e62a52162d523 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s + +--- | + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { + entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = extractvalue { x86_amx, x86_amx } %0, 1 + %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 + %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 + ret void + } + + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 + + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 + + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + + declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 + + attributes #0 = { nounwind uwtable 
"frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { nounwind } + +... +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr32, preferred-register: '' } + - { id: 1, class: gr32, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr16, preferred-register: '' } + - { id: 5, class: gr16, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 12, class: tile, preferred-register: '' } + - { id: 13, class: gr64, preferred-register: '' } + - { id: 14, class: vr512, preferred-register: 
'' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx + + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, 
$noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) + ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) + ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf + ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 + ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: RET 0 + %2:gr32 = COPY $edx + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %14:vr512 = AVX512_512_SET0 + VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) + MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + %6:gr64 = 
MOV32ri64 @buf + %7:gr64_nosp = MOV32ri64 32 + %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg + %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit + %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 + %13:gr64 = MOV32ri64 @buf2 + PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 + RET 0 + +... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir new file mode 100644 index 0000000000000..857ad433af153 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir @@ -0,0 +1,97 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: [] +liveins: + - { reg: '$edi', virtual-reg: '' } + - { reg: '$esi', virtual-reg: '' } + - { reg: '$edx', virtual-reg: '' } + - { reg: '$cx', virtual-reg: '' } + - { reg: '$r9', virtual-reg: '' } + - { reg: '$r10', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - 
{ id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 + + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg + ; CHECK-NEXT: $rax = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) + ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) + ; CHECK-NEXT: $rax = MOV64ri 64 + ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) + ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) + ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 + ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) + ; CHECK-NEXT: renamable $di = MOV16ri 64 + ; CHECK-NEXT: renamable $cx = MOV16ri 16 + ; CHECK-NEXT: 
PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 + ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) + renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg + renamable $tmm0 = COPY renamable $tmm5 + renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 + renamable $r8 = MOV32ri64 64 + MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) + renamable $di = MOV16ri 64 + renamable $cx = MOV16ri 16 + PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 + PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 + +... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll new file mode 100644 index 0000000000000..52641c65c90e9 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py + ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + + ; Function Attrs: noinline nounwind optnone uwtable + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { +; CHECK-LABEL: @test_tile_2rpntlvwz0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) +; CHECK-NEXT: 
[[TMP14:%.*]] = sext i16 [[COL1]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]]) +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) +; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) +; CHECK-NEXT: ret void +; + entry: + + %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 + store <256 x i32> %2, ptr %m, align 1024 + + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 + store <256 x i32> %4, ptr %m, align 1024 + + %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 + %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 + store <256 x i32> %6, ptr %m, align 64 + + %7 = load <256 x i32>, ptr %m, align 64 + %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 + %9 = load <256 x i32>, ptr %m, align 64 + %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 + %11 = load <256 x i32>, ptr %m, align 64 + %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 + + %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 + %14 = call <256 x i32> 
@llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 + store <256 x i32> %14, ptr %m, align 64 + + ret void + } + + ; Function Attrs: argmemonly nounwind readonly + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 + + ; Function Attrs: nounwind readnone + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 + + ; Function Attrs: nounwind readnone + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 + + ; Function Attrs: argmemonly nounwind writeonly + declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 + + attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #7 = { nounwind } diff --git 
a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll new file mode 100644 index 0000000000000..346d46b6b16c2 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + + @buf = dso_local global [2048 x i8] zeroinitializer, align 16 + @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 + + ; Function Attrs: nounwind uwtable + define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { +; CHECK-LABEL: @test_tile_2rpntlvwz0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] +; CHECK-NEXT: ret void +; + entry: + %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 + %1 = extractvalue { x86_amx, x86_amx } %0, 0 + %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 + %3 = extractvalue { x86_amx, x86_amx } %0, 1 + %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 + %5 = tail call 
x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 + %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 + %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 + %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 + %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 + %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 + %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 + %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 + tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 + ret void + } + + ; Function Attrs: argmemonly nounwind readonly + declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 + + ; Function Attrs: nounwind readnone + declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 + + ; Function Attrs: nounwind + declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 + + ; Function Attrs: nounwind readnone + declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 + + ; Function Attrs: argmemonly nounwind writeonly + declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 + + attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } + attributes #1 = { argmemonly nounwind readonly 
"target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } + attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir new file mode 100644 index 0000000000000..cdc525193fef7 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr64_nosp, preferred-register: '' } + - { id: 1, class: gr16, preferred-register: '' } + - { id: 2, class: gr16, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr64, preferred-register: '' } + - { id: 5, class: gr64, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 181, class: tile, preferred-register: '' } + - { 
id: 183, class: tile, preferred-register: '' } + - { id: 185, class: tile, preferred-register: '' } + - { id: 186, class: tile, preferred-register: '' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1024 + adjustsStack: false + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 21, name: '', type: default, offset: 0, size: 8, + alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $rdi, $rsi, $rdx, $rax + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $rdi, $rsi, $rdx, $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 + ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 + ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 + ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 + ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] + ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV 
[[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] + ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + %0:gr64_nosp = MOV32ri64 64 + %1:gr16 = MOV16ri 64 + %2:gr16 = MOV16ri 16 + %3:gr16 = MOV16ri 16 + %4:gr64 = COPY $rsi + %5:gr64 = COPY $rdi + %6:gr64 = COPY $rdx + %7:gr64_nosp = COPY $rax + %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg + %9:tile = COPY %8.sub_t1 + %10:tile = COPY %8.sub_t0 + PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 + PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 + %11:tile = PTILEZEROV %1, %2 + PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 + %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg + %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg + %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg + %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 + PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 +... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir new file mode 100644 index 0000000000000..a9824dcac6b04 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir @@ -0,0 +1,113 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s + +--- +name: test_tile_2rpntlvwz0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gr32, preferred-register: '' } + - { id: 1, class: gr32, preferred-register: '' } + - { id: 2, class: gr32, preferred-register: '' } + - { id: 3, class: gr16, preferred-register: '' } + - { id: 4, class: gr16, preferred-register: '' } + - { id: 5, class: gr16, preferred-register: '' } + - { id: 6, class: gr64, preferred-register: '' } + - { id: 7, class: gr64_nosp, preferred-register: '' } + - { id: 8, class: tilepair, preferred-register: '' } + - { id: 9, class: tile, preferred-register: '' } + - { id: 10, class: tile, preferred-register: '' } + - { id: 11, class: tile, preferred-register: '' } + - { id: 12, class: tile, preferred-register: '' } + - { id: 13, class: gr64, preferred-register: '' } +liveins: + - { reg: '$edi', virtual-reg: '%0' } + - { reg: '$esi', virtual-reg: '%1' } + - { reg: '$edx', virtual-reg: '%2' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + 
functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: ManagedRA +body: | + bb.0.entry: + liveins: $edi, $esi, $edx, $rax, $rbx + + ; CHECK-LABEL: name: test_tile_2rpntlvwz0 + ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 + ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit + ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax + ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 + ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] + ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV 
[[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx + ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] + ; CHECK-NEXT: RET 0 + %2:gr32 = COPY $edx + %1:gr32 = COPY $esi + %0:gr32 = COPY $edi + %3:gr16 = COPY %2.sub_16bit + %4:gr16 = COPY %1.sub_16bit + %5:gr16 = COPY %0.sub_16bit + %6:gr64 = COPY $rax + %7:gr64_nosp = MOV32ri64 32 + %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg + %9:tile = COPY %8.sub_t1 + %10:tile = COPY %8.sub_t0 + %11:tile = PTILEZEROV %5, %4 + %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 + %13:gr64 = COPY $rbx + PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 + RET 0 + +... diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll new file mode 100644 index 0000000000000..b06a9369b9762 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-tile,+amx-bf16,+amx-int8,+amx-transpose | FileCheck %s + +define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 +; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 +; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 +; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 +; CHECK-NEXT: ttransposed %tmm3, %tmm1 +; CHECK-NEXT: retq + call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) + call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) + call void @llvm.x86.ttransposed(i8 1, i8 3) + ret void +} + +declare void 
@llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) +declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) + +define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { +; CHECK-LABEL: test_amx3: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movw $8, %cx +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: ttransposed %tmm4, %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) + %5 = extractvalue { x86_amx, x86_amx } %4, 0 + %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) + ret void +} + +define void @test_amx_spill(i8* 
%pointer, i8* %base, i64 %stride) #0 { +; CHECK-LABEL: test_amx_spill: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 +; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: movabsq $64, %rcx +; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 
3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) +; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) +; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) + %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 + %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 + %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 + %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 + %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 + %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 + %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 + %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 + %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 + %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 
%stride, x86_amx %e21) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) + ret void +} + +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) +declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) +declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index d1b8be15a2d03..9b123b730a214 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -3,7 +3,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 
$dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 
$ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -11,7 +11,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 
$st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 
$fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 5ae5caf3e88b2..537e05310dbea 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -1763,7 +1763,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: pushq 
%r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %rdi, %r14 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp @@ -1779,11 +1779,11 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %sil ; SSE2-NEXT: setg %dil ; SSE2-NEXT: subb %sil, %dil -; SSE2-NEXT: movsbq %dil, %rax -; SSE2-NEXT: movq %rax, (%r14) -; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movsbq %dil, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq %rdi, (%rax) +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: addb %r11b, %r11b ; SSE2-NEXT: sarb %r11b ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi @@ -1793,9 +1793,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %sil ; SSE2-NEXT: setg %r11b ; SSE2-NEXT: subb %sil, %r11b -; SSE2-NEXT: movsbq %r11b, %rdi -; SSE2-NEXT: movq %rdi, %r11 -; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movsbq %r11b, %r11 +; SSE2-NEXT: movq %r11, %r14 +; SSE2-NEXT: sarq $63, %r14 ; SSE2-NEXT: addb %r12b, %r12b ; SSE2-NEXT: sarb %r12b ; SSE2-NEXT: addb %dl, %dl @@ -1804,18 +1804,18 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %sil ; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: movsbq %sil, %rdx -; SSE2-NEXT: movq %rdx, %r13 -; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movsbq %sil, %r13 +; SSE2-NEXT: movq %r13, %rdi +; SSE2-NEXT: sarq $63, %rdi ; SSE2-NEXT: addb %r15b, %r15b ; SSE2-NEXT: sarb %r15b ; SSE2-NEXT: addb %cl, %cl ; SSE2-NEXT: sarb %cl ; SSE2-NEXT: cmpb %r15b, %cl ; SSE2-NEXT: setl %cl -; SSE2-NEXT: setg %sil -; SSE2-NEXT: subb %cl, %sil -; SSE2-NEXT: movsbq %sil, %r15 +; SSE2-NEXT: setg %dl +; 
SSE2-NEXT: subb %cl, %dl +; SSE2-NEXT: movsbq %dl, %r15 ; SSE2-NEXT: movq %r15, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: addb %bpl, %bpl @@ -1823,9 +1823,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: addb %r8b, %r8b ; SSE2-NEXT: sarb %r8b ; SSE2-NEXT: cmpb %bpl, %r8b -; SSE2-NEXT: setl %sil +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r8b -; SSE2-NEXT: subb %sil, %r8b +; SSE2-NEXT: subb %dl, %r8b ; SSE2-NEXT: movsbq %r8b, %r8 ; SSE2-NEXT: movq %r8, %r12 ; SSE2-NEXT: sarq $63, %r12 @@ -1834,85 +1834,83 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: addb %r9b, %r9b ; SSE2-NEXT: sarb %r9b ; SSE2-NEXT: cmpb %bl, %r9b -; SSE2-NEXT: setl %sil +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r9b -; SSE2-NEXT: subb %sil, %r9b -; SSE2-NEXT: movsbq %r9b, %r9 -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: subb %dl, %r9b +; SSE2-NEXT: movsbq %r9b, %rsi +; SSE2-NEXT: movq %rsi, %r9 +; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: addb %r10b, %r10b ; SSE2-NEXT: sarb %r10b -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: addb %sil, %sil -; SSE2-NEXT: sarb %sil -; SSE2-NEXT: cmpb %r10b, %sil -; SSE2-NEXT: setl %sil +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: addb %dl, %dl +; SSE2-NEXT: sarb %dl +; SSE2-NEXT: cmpb %r10b, %dl +; SSE2-NEXT: setl %dl ; SSE2-NEXT: setg %r10b -; SSE2-NEXT: subb %sil, %r10b -; SSE2-NEXT: movsbq %r10b, %rbp -; SSE2-NEXT: movq %rbp, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shldq $62, %rbp, %rsi -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: movq %rsi, 88(%r14) -; SSE2-NEXT: shrq $2, %r10 -; SSE2-NEXT: movl %r10d, 96(%r14) -; SSE2-NEXT: movq %rbx, %rsi -; SSE2-NEXT: shldq $20, %r9, %rsi -; SSE2-NEXT: movq %rsi, 64(%r14) -; SSE2-NEXT: movq %r12, %rsi -; SSE2-NEXT: shldq $31, %r8, %rsi -; SSE2-NEXT: movq %rsi, 48(%r14) -; SSE2-NEXT: movq %rcx, %rsi -; SSE2-NEXT: shldq $42, %r15, %rsi -; 
SSE2-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800 -; SSE2-NEXT: andq %r13, %rax -; SSE2-NEXT: shldq $53, %rdx, %r13 -; SSE2-NEXT: movq %rsi, 32(%r14) -; SSE2-NEXT: movq %r13, 16(%r14) -; SSE2-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF -; SSE2-NEXT: andq %rsi, %r11 -; SSE2-NEXT: shldq $9, %rdi, %r11 -; SSE2-NEXT: shlq $62, %rbp -; SSE2-NEXT: orq %r11, %rbp -; SSE2-NEXT: movq %rbp, 80(%r14) -; SSE2-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF -; SSE2-NEXT: andq %r10, %r11 -; SSE2-NEXT: movq %r11, %r10 -; SSE2-NEXT: shrq $48, %r10 -; SSE2-NEXT: movb %r10b, 102(%r14) -; SSE2-NEXT: shrq $32, %r11 -; SSE2-NEXT: movw %r11w, 100(%r14) +; SSE2-NEXT: subb %dl, %r10b +; SSE2-NEXT: movsbq %r10b, %r10 +; SSE2-NEXT: movq %r10, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movl %edx, 96(%rax) +; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF +; SSE2-NEXT: andq %rdx, %rbp +; SSE2-NEXT: shldq $62, %r10, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; SSE2-NEXT: # xmm0 = mem[0],zero +; SSE2-NEXT: movq %r9, %rbx +; SSE2-NEXT: shldq $20, %rsi, %rbx +; SSE2-NEXT: movq %rdx, 88(%rax) +; SSE2-NEXT: movq %r12, %rdx +; SSE2-NEXT: shldq $31, %r8, %rdx +; SSE2-NEXT: movq %rbx, 64(%rax) +; SSE2-NEXT: movq %rcx, %rbx +; SSE2-NEXT: shldq $42, %r15, %rbx +; SSE2-NEXT: movq %rdx, 48(%rax) +; SSE2-NEXT: movq %rbx, 32(%rax) +; SSE2-NEXT: movabsq $9007199254738944, %rbx # imm = 0x1FFFFFFFFFF800 +; SSE2-NEXT: andq %rdi, %rbx +; SSE2-NEXT: shldq $53, %r13, %rdi +; SSE2-NEXT: movq %rdi, 16(%rax) +; SSE2-NEXT: movq %rbp, %rdx +; SSE2-NEXT: shrq $48, %rdx +; SSE2-NEXT: movb %dl, 102(%rax) +; SSE2-NEXT: shrq $32, %rbp +; SSE2-NEXT: movabsq $9007199254740991, %rdx # imm = 0x1FFFFFFFFFFFFF +; SSE2-NEXT: andq %rdx, %r14 +; SSE2-NEXT: shldq $9, %r11, %r14 +; SSE2-NEXT: movw %bp, 100(%rax) +; SSE2-NEXT: shlq $62, %r10 +; SSE2-NEXT: orq %r14, %r10 +; SSE2-NEXT: movq %r10, 80(%rax) ; SSE2-NEXT: shlq 
$42, %r15 -; SSE2-NEXT: shrq $11, %rax -; SSE2-NEXT: orq %r15, %rax -; SSE2-NEXT: movq %rax, 24(%r14) -; SSE2-NEXT: shlq $9, %rdi -; SSE2-NEXT: shrq $44, %rbx -; SSE2-NEXT: andl $511, %ebx # imm = 0x1FF -; SSE2-NEXT: orq %rdi, %rbx -; SSE2-NEXT: movq %rbx, 72(%r14) -; SSE2-NEXT: shlq $20, %r9 +; SSE2-NEXT: shrq $11, %rbx +; SSE2-NEXT: orq %r15, %rbx +; SSE2-NEXT: movq %rbx, 24(%rax) +; SSE2-NEXT: shlq $9, %r11 +; SSE2-NEXT: shrq $44, %r9 +; SSE2-NEXT: andl $511, %r9d # imm = 0x1FF +; SSE2-NEXT: orq %r11, %r9 +; SSE2-NEXT: movq %r9, 72(%rax) +; SSE2-NEXT: shlq $20, %rsi ; SSE2-NEXT: shrq $33, %r12 ; SSE2-NEXT: andl $1048575, %r12d # imm = 0xFFFFF -; SSE2-NEXT: orq %r9, %r12 -; SSE2-NEXT: movq %r12, 56(%r14) +; SSE2-NEXT: orq %rsi, %r12 +; SSE2-NEXT: movq %r12, 56(%rax) ; SSE2-NEXT: shlq $31, %r8 ; SSE2-NEXT: shrq $22, %rcx ; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE2-NEXT: orq %r8, %rcx -; SSE2-NEXT: movq %rcx, 40(%r14) +; SSE2-NEXT: movq %rcx, 40(%rax) ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; SSE2-NEXT: # xmm1 = mem[0],zero ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: andq %rsi, %rax -; SSE2-NEXT: shlq $53, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: movq %rdx, 8(%r14) -; SSE2-NEXT: movq %r14, %rax +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: andq %rdx, %rcx +; SSE2-NEXT: shlq $53, %r13 +; SSE2-NEXT: orq %rcx, %r13 +; SSE2-NEXT: movq %r13, 8(%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1929,151 +1927,148 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: pushq %r13 ; SSE4-NEXT: pushq %r12 ; SSE4-NEXT: pushq %rbx -; SSE4-NEXT: movq %rdi, %r14 +; SSE4-NEXT: movq %rdi, %rbx +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE4-NEXT: movzbl 
{{[0-9]+}}(%rsp), %eax ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; SSE4-NEXT: addb %dil, %dil -; SSE4-NEXT: sarb %dil +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE4-NEXT: addb %r14b, %r14b +; SSE4-NEXT: sarb %r14b ; SSE4-NEXT: addb %sil, %sil ; SSE4-NEXT: sarb %sil -; SSE4-NEXT: cmpb %dil, %sil +; SSE4-NEXT: cmpb %r14b, %sil ; SSE4-NEXT: setl %sil -; SSE4-NEXT: setg %dil -; SSE4-NEXT: subb %sil, %dil -; SSE4-NEXT: movsbq %dil, %r12 -; SSE4-NEXT: movq %r12, %rdi -; SSE4-NEXT: sarq $63, %rdi -; SSE4-NEXT: addb %r10b, %r10b -; SSE4-NEXT: sarb %r10b +; SSE4-NEXT: setg %r14b +; SSE4-NEXT: subb %sil, %r14b +; SSE4-NEXT: movsbq %r14b, %r14 +; SSE4-NEXT: movq %r14, (%rbx) +; SSE4-NEXT: sarq $63, %r14 +; SSE4-NEXT: addb %r15b, %r15b +; SSE4-NEXT: sarb %r15b ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; SSE4-NEXT: addb %sil, %sil ; SSE4-NEXT: sarb %sil -; SSE4-NEXT: cmpb %r10b, %sil +; SSE4-NEXT: cmpb %r15b, %sil ; SSE4-NEXT: setl %sil -; SSE4-NEXT: setg %r10b -; SSE4-NEXT: subb %sil, %r10b -; SSE4-NEXT: movsbq %r10b, %r10 -; SSE4-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: sarq $63, %r10 -; SSE4-NEXT: addb %r11b, %r11b -; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: setg %r15b +; SSE4-NEXT: subb %sil, %r15b +; SSE4-NEXT: movsbq %r15b, %r15 +; SSE4-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: sarq $63, %r15 +; SSE4-NEXT: addb %bpl, %bpl +; SSE4-NEXT: sarb %bpl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: sarb %dl -; SSE4-NEXT: cmpb %r11b, %dl +; SSE4-NEXT: cmpb %bpl, %dl ; SSE4-NEXT: setl %dl -; SSE4-NEXT: setg %r11b -; SSE4-NEXT: subb %dl, %r11b -; SSE4-NEXT: movsbq %r11b, %r11 -; SSE4-NEXT: movq %r11, %rsi -; SSE4-NEXT: sarq $63, %rsi -; SSE4-NEXT: addb %bl, %bl -; 
SSE4-NEXT: sarb %bl +; SSE4-NEXT: setg %bpl +; SSE4-NEXT: subb %dl, %bpl +; SSE4-NEXT: movsbq %bpl, %r12 +; SSE4-NEXT: movq %r12, %r13 +; SSE4-NEXT: sarq $63, %r13 +; SSE4-NEXT: addb %al, %al +; SSE4-NEXT: sarb %al ; SSE4-NEXT: addb %cl, %cl ; SSE4-NEXT: sarb %cl -; SSE4-NEXT: cmpb %bl, %cl +; SSE4-NEXT: cmpb %al, %cl ; SSE4-NEXT: setl %cl ; SSE4-NEXT: setg %dl ; SSE4-NEXT: subb %cl, %dl -; SSE4-NEXT: movsbq %dl, %rbx -; SSE4-NEXT: movq %rbx, %rcx +; SSE4-NEXT: movsbq %dl, %rsi +; SSE4-NEXT: movq %rsi, %rcx ; SSE4-NEXT: sarq $63, %rcx -; SSE4-NEXT: addb %r13b, %r13b -; SSE4-NEXT: sarb %r13b +; SSE4-NEXT: addb %r11b, %r11b +; SSE4-NEXT: sarb %r11b ; SSE4-NEXT: addb %r8b, %r8b ; SSE4-NEXT: sarb %r8b -; SSE4-NEXT: cmpb %r13b, %r8b +; SSE4-NEXT: cmpb %r11b, %r8b ; SSE4-NEXT: setl %dl ; SSE4-NEXT: setg %r8b ; SSE4-NEXT: subb %dl, %r8b ; SSE4-NEXT: movsbq %r8b, %rdx ; SSE4-NEXT: movq %rdx, %r8 ; SSE4-NEXT: sarq $63, %r8 -; SSE4-NEXT: addb %r15b, %r15b -; SSE4-NEXT: sarb %r15b +; SSE4-NEXT: addb %r10b, %r10b +; SSE4-NEXT: sarb %r10b ; SSE4-NEXT: addb %r9b, %r9b ; SSE4-NEXT: sarb %r9b -; SSE4-NEXT: cmpb %r15b, %r9b +; SSE4-NEXT: cmpb %r10b, %r9b ; SSE4-NEXT: setl %r9b -; SSE4-NEXT: setg %r15b -; SSE4-NEXT: subb %r9b, %r15b -; SSE4-NEXT: movsbq %r15b, %r9 -; SSE4-NEXT: movq %r9, %r15 -; SSE4-NEXT: sarq $63, %r15 -; SSE4-NEXT: addb %bpl, %bpl -; SSE4-NEXT: sarb %bpl -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d -; SSE4-NEXT: addb %r13b, %r13b -; SSE4-NEXT: sarb %r13b -; SSE4-NEXT: cmpb %bpl, %r13b -; SSE4-NEXT: setl %bpl -; SSE4-NEXT: setg %r13b -; SSE4-NEXT: subb %bpl, %r13b -; SSE4-NEXT: movsbq %r13b, %rbp +; SSE4-NEXT: setg %r10b +; SSE4-NEXT: subb %r9b, %r10b +; SSE4-NEXT: movsbq %r10b, %r9 +; SSE4-NEXT: movq %r9, %r10 +; SSE4-NEXT: sarq $63, %r10 +; SSE4-NEXT: addb %dil, %dil +; SSE4-NEXT: sarb %dil +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; SSE4-NEXT: addb %r11b, %r11b +; SSE4-NEXT: sarb %r11b +; SSE4-NEXT: cmpb %dil, %r11b +; SSE4-NEXT: setl %dil +; SSE4-NEXT: 
setg %r11b +; SSE4-NEXT: subb %dil, %r11b +; SSE4-NEXT: movsbq %r11b, %rdi +; SSE4-NEXT: movq %rdi, %rbp +; SSE4-NEXT: sarq $63, %rbp +; SSE4-NEXT: movl %ebp, 96(%rbx) ; SSE4-NEXT: movq %rbp, %rax -; SSE4-NEXT: sarq $63, %rax -; SSE4-NEXT: movq %rax, %r13 -; SSE4-NEXT: shldq $62, %rbp, %r13 -; SSE4-NEXT: movq %r12, (%r14) -; SSE4-NEXT: movq %r13, 88(%r14) -; SSE4-NEXT: shrq $2, %rax -; SSE4-NEXT: movl %eax, 96(%r14) -; SSE4-NEXT: movq %r15, %r12 -; SSE4-NEXT: shldq $20, %r9, %r12 -; SSE4-NEXT: movq %r12, 64(%r14) -; SSE4-NEXT: movq %r8, %r12 -; SSE4-NEXT: shldq $31, %rdx, %r12 -; SSE4-NEXT: movq %r12, 48(%r14) -; SSE4-NEXT: movq %rcx, %r12 -; SSE4-NEXT: shldq $42, %rbx, %r12 -; SSE4-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800 -; SSE4-NEXT: andq %rsi, %r13 -; SSE4-NEXT: shldq $53, %r11, %rsi -; SSE4-NEXT: movq %r12, 32(%r14) -; SSE4-NEXT: movq %rsi, 16(%r14) -; SSE4-NEXT: movabsq $9007199254740991, %rsi # imm = 0x1FFFFFFFFFFFFF -; SSE4-NEXT: andq %rsi, %r10 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: shldq $9, %r12, %r10 -; SSE4-NEXT: shlq $62, %rbp -; SSE4-NEXT: orq %r10, %rbp -; SSE4-NEXT: movq %rbp, 80(%r14) -; SSE4-NEXT: andq %rsi, %rdi -; SSE4-NEXT: shlq $53, %r11 -; SSE4-NEXT: orq %rdi, %r11 -; SSE4-NEXT: movq %r11, 8(%r14) -; SSE4-NEXT: movabsq $2251799813685247, %rsi # imm = 0x7FFFFFFFFFFFF -; SSE4-NEXT: andq %rax, %rsi -; SSE4-NEXT: movq %rsi, %rax -; SSE4-NEXT: shrq $48, %rax -; SSE4-NEXT: movb %al, 102(%r14) -; SSE4-NEXT: shrq $32, %rsi -; SSE4-NEXT: movw %si, 100(%r14) -; SSE4-NEXT: shlq $42, %rbx -; SSE4-NEXT: shrq $11, %r13 -; SSE4-NEXT: orq %rbx, %r13 -; SSE4-NEXT: movq %r13, 24(%r14) -; SSE4-NEXT: movq %r12, %rax -; SSE4-NEXT: shlq $9, %rax -; SSE4-NEXT: shrq $44, %r15 -; SSE4-NEXT: andl $511, %r15d # imm = 0x1FF -; SSE4-NEXT: orq %rax, %r15 -; SSE4-NEXT: movq %r15, 72(%r14) +; SSE4-NEXT: shldq $62, %rdi, %rax +; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF +; SSE4-NEXT: 
andq %rbp, %r11 +; SSE4-NEXT: movq %r10, %rbp +; SSE4-NEXT: shldq $20, %r9, %rbp +; SSE4-NEXT: movq %rax, 88(%rbx) +; SSE4-NEXT: movq %r8, %rax +; SSE4-NEXT: shldq $31, %rdx, %rax +; SSE4-NEXT: movq %rbp, 64(%rbx) +; SSE4-NEXT: movq %rcx, %rbp +; SSE4-NEXT: shldq $42, %rsi, %rbp +; SSE4-NEXT: movq %rax, 48(%rbx) +; SSE4-NEXT: movq %rbp, 32(%rbx) +; SSE4-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800 +; SSE4-NEXT: andq %r13, %rax +; SSE4-NEXT: shldq $53, %r12, %r13 +; SSE4-NEXT: movq %r13, 16(%rbx) +; SSE4-NEXT: movq %r11, %r13 +; SSE4-NEXT: shrq $48, %r13 +; SSE4-NEXT: movb %r13b, 102(%rbx) +; SSE4-NEXT: shrq $32, %r11 +; SSE4-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF +; SSE4-NEXT: andq %r13, %r15 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE4-NEXT: shldq $9, %rbp, %r15 +; SSE4-NEXT: movw %r11w, 100(%rbx) +; SSE4-NEXT: shlq $62, %rdi +; SSE4-NEXT: orq %r15, %rdi +; SSE4-NEXT: movq %rdi, 80(%rbx) +; SSE4-NEXT: andq %r13, %r14 +; SSE4-NEXT: shlq $53, %r12 +; SSE4-NEXT: orq %r14, %r12 +; SSE4-NEXT: movq %r12, 8(%rbx) +; SSE4-NEXT: shlq $42, %rsi +; SSE4-NEXT: shrq $11, %rax +; SSE4-NEXT: orq %rsi, %rax +; SSE4-NEXT: movq %rax, 24(%rbx) +; SSE4-NEXT: shlq $9, %rbp +; SSE4-NEXT: shrq $44, %r10 +; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF +; SSE4-NEXT: orq %rbp, %r10 +; SSE4-NEXT: movq %r10, 72(%rbx) ; SSE4-NEXT: shlq $20, %r9 ; SSE4-NEXT: shrq $33, %r8 ; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF ; SSE4-NEXT: orq %r9, %r8 -; SSE4-NEXT: movq %r8, 56(%r14) +; SSE4-NEXT: movq %r8, 56(%rbx) ; SSE4-NEXT: shlq $31, %rdx ; SSE4-NEXT: shrq $22, %rcx ; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE4-NEXT: orq %rdx, %rcx -; SSE4-NEXT: movq %rcx, 40(%r14) -; SSE4-NEXT: movq %r14, %rax +; SSE4-NEXT: movq %rcx, 40(%rbx) +; SSE4-NEXT: movq %rbx, %rax ; SSE4-NEXT: popq %rbx ; SSE4-NEXT: popq %r12 ; SSE4-NEXT: popq %r13 @@ -2174,14 +2169,14 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> 
%y) nounwind { ; AVX-NEXT: setl %dil ; AVX-NEXT: setg %r11b ; AVX-NEXT: subb %dil, %r11b -; AVX-NEXT: movsbq %r11b, %r11 -; AVX-NEXT: movq %r11, %rdi -; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: movsbq %r11b, %rdi ; AVX-NEXT: movq %rdi, %rbp -; AVX-NEXT: shldq $62, %r11, %rbp +; AVX-NEXT: sarq $63, %rbp +; AVX-NEXT: movl %ebp, 96(%rax) +; AVX-NEXT: movb $51, %r11b +; AVX-NEXT: bzhiq %r11, %rbp, %r11 +; AVX-NEXT: shldq $62, %rdi, %rbp ; AVX-NEXT: movq %rbp, 88(%rax) -; AVX-NEXT: shrq $2, %rdi -; AVX-NEXT: movl %edi, 96(%rax) ; AVX-NEXT: movq %r10, %rbp ; AVX-NEXT: shldq $20, %r9, %rbp ; AVX-NEXT: movq %rbp, 64(%rax) @@ -2195,23 +2190,21 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: bzhiq %rbp, %r13, %rbp ; AVX-NEXT: shldq $53, %r12, %r13 ; AVX-NEXT: movq %r13, 16(%rax) -; AVX-NEXT: movb $53, %r13b -; AVX-NEXT: bzhiq %r13, %r15, %r15 +; AVX-NEXT: movq %r11, %r13 +; AVX-NEXT: shrq $48, %r13 +; AVX-NEXT: movb %r13b, 102(%rax) +; AVX-NEXT: shrq $32, %r11 +; AVX-NEXT: movw %r11w, 100(%rax) +; AVX-NEXT: movb $53, %r11b +; AVX-NEXT: bzhiq %r11, %r15, %r15 ; AVX-NEXT: shldq $9, %rsi, %r15 -; AVX-NEXT: shlq $62, %r11 -; AVX-NEXT: orq %r15, %r11 -; AVX-NEXT: movq %r11, 80(%rax) -; AVX-NEXT: bzhiq %r13, %r14, %r11 +; AVX-NEXT: shlq $62, %rdi +; AVX-NEXT: orq %r15, %rdi +; AVX-NEXT: movq %rdi, 80(%rax) +; AVX-NEXT: bzhiq %r11, %r14, %rdi ; AVX-NEXT: shlq $53, %r12 -; AVX-NEXT: orq %r11, %r12 +; AVX-NEXT: orq %rdi, %r12 ; AVX-NEXT: movq %r12, 8(%rax) -; AVX-NEXT: movb $51, %r11b -; AVX-NEXT: bzhiq %r11, %rdi, %rdi -; AVX-NEXT: movq %rdi, %r11 -; AVX-NEXT: shrq $48, %r11 -; AVX-NEXT: movb %r11b, 102(%rax) -; AVX-NEXT: shrq $32, %rdi -; AVX-NEXT: movw %di, 100(%rax) ; AVX-NEXT: shlq $42, %rbx ; AVX-NEXT: shrq $11, %rbp ; AVX-NEXT: orq %rbx, %rbp @@ -2270,24 +2263,24 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) 
# 1-byte Spill -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addb %dl, %dl -; X86-NEXT: sarb %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: addb %ah, %ah -; X86-NEXT: sarb %ah -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addb %cl, %cl -; X86-NEXT: sarb %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: addb %ch, %ch -; X86-NEXT: sarb %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addb %bl, %bl ; X86-NEXT: sarb %bl ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: addb %bh, %bh ; X86-NEXT: sarb %bh +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb %dl, %dl +; X86-NEXT: sarb %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: addb %ch, %ch +; X86-NEXT: sarb %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: addb %cl, %cl +; X86-NEXT: sarb %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: addb %ah, %ah +; X86-NEXT: sarb %ah ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al @@ -2304,140 +2297,136 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: cmpb %cl, %ah ; X86-NEXT: setl %al -; X86-NEXT: setg %dh -; X86-NEXT: subb %al, %dh -; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: setg %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movsbl %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %cl, %ch +; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %dl, 
%ch ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movsbl %cl, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, (%ebp) -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %dl, %ah -; X86-NEXT: setl %al +; X86-NEXT: movsbl %cl, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl +; X86-NEXT: subb %cl, %dl ; X86-NEXT: movsbl %dl, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al -; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: setl %al +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %al, %dl -; X86-NEXT: movsbl %dl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: setl %dl -; X86-NEXT: setg %dh -; X86-NEXT: subb %dl, %dh -; X86-NEXT: movsbl %dh, %edx 
+; X86-NEXT: subb %cl, %dl +; X86-NEXT: movsbl %dl, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, 96(%ebp) -; X86-NEXT: movl %edx, 92(%ebp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 80(%ebp) -; X86-NEXT: movl %eax, 68(%ebp) -; X86-NEXT: movl %eax, 64(%ebp) -; X86-NEXT: movl %esi, 52(%ebp) -; X86-NEXT: movl %esi, 48(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, 36(%ebp) -; X86-NEXT: movl %edi, 24(%ebp) -; X86-NEXT: movl %edi, 20(%ebp) -; X86-NEXT: movl %ecx, 8(%ebp) -; X86-NEXT: movl %ecx, 4(%ebp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $30, %edx, %ecx -; X86-NEXT: movl %ecx, 88(%ebp) -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, 96(%ecx) +; X86-NEXT: movl %esi, 92(%ecx) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ebp, 80(%ecx) +; X86-NEXT: movl %ebx, 
68(%ecx) +; X86-NEXT: movl %ebx, 64(%ecx) +; X86-NEXT: movl %edx, 52(%ecx) +; X86-NEXT: movl %edx, 48(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $9, %ebp, %ecx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl %ecx, 76(%ebx) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $20, %ebx, %ecx -; X86-NEXT: movl %ecx, 60(%ebp) -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $31, %ebx, %ecx -; X86-NEXT: movl %ecx, 44(%ebp) -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl %ebp, 36(%ecx) +; X86-NEXT: movl %edi, 24(%ecx) +; X86-NEXT: movl %edi, 20(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 100(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $30, %ecx, %eax +; X86-NEXT: movl %eax, 88(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $9, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $9, %ebp, %eax +; X86-NEXT: movl %eax, 76(%esi) +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $10, %ebp, %ecx -; X86-NEXT: movl %ecx, 32(%ebx) -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: shldl $20, %ebp, %eax +; X86-NEXT: movl %eax, 60(%esi) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $31, %ebp, %eax +; X86-NEXT: movl %eax, 44(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $10, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; 
X86-NEXT: shldl $21, %ebp, %ecx -; X86-NEXT: movl %ecx, 16(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: shrl $2, %ecx -; X86-NEXT: movw %cx, 100(%ebx) +; X86-NEXT: shldl $10, %ebp, %eax +; X86-NEXT: movl %eax, 32(%esi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $21, %ebp, %eax +; X86-NEXT: movl %eax, 16(%esi) ; X86-NEXT: shll $21, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, 12(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shll $30, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, 84(%ebx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shll $9, %ecx -; X86-NEXT: shrl $12, %eax -; X86-NEXT: andl $511, %eax # imm = 0x1FF -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl %eax, 72(%ebx) +; X86-NEXT: movl %ebp, 12(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: andl $7, %eax +; X86-NEXT: movb %al, 102(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $30, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 84(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $9, %eax +; X86-NEXT: shrl $12, %ebx +; X86-NEXT: andl $511, %ebx # imm = 0x1FF +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: movl %ebx, 72(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $20, %eax -; X86-NEXT: shrl %esi -; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, 56(%ebx) +; X86-NEXT: shrl %edx +; X86-NEXT: andl $1048575, %edx # imm = 0xFFFFF +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, 56(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload ; X86-NEXT: shll $31, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 40(%ebx) +; X86-NEXT: movl %eax, 40(%esi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $10, %eax ; X86-NEXT: shrl $11, %edi ; X86-NEXT: andl $1023, %edi # imm = 0x3FF ; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, 28(%ebx) -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl $18, %eax -; X86-NEXT: andl $7, %eax -; X86-NEXT: movb %al, 102(%ebx) -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, 28(%esi) +; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-6.ll b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-6.ll new file mode 100644 index 0000000000000..8c8eef68b2ec0 --- /dev/null +++ b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-6.ll @@ -0,0 +1,92 @@ +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s + +; CHECK: DW_TAG_variable +; CHECK-NEXT: DW_AT_location +; CHECK-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_consts +7, DW_OP_stack_value +; CHECK-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_consts +8, DW_OP_stack_value +; CHECK-NEXT: DW_AT_name ("i") + +; SECTIONS: DW_TAG_variable +; SECTIONS-NEXT: DW_AT_location +; SECTIONS-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): 
DW_OP_consts +7, DW_OP_stack_value +; SECTIONS-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_consts +8, DW_OP_stack_value +; SECTIONS-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_consts +8, DW_OP_stack_value +; SECTIONS-NEXT: [0x{{[0-9a-f]+}}, 0x{{[0-9a-f]+}}): DW_OP_consts +8, DW_OP_stack_value +; SECTIONS-NEXT: DW_AT_name ("i") + +; Source to generate the IR below: +; void f1(); +; extern bool b; +; void test() { +; // i is not a const throughout the whole scope and should +; // not use DW_AT_const_value +; int i = 7; +; f1(); +; i = 8; +; if (b) +; f1(); +; } +; $ clang++ -S loclist_section.cc -O2 -g -emit-llvm + +@b = external local_unnamed_addr global i8, align 1 + +; Function Attrs: mustprogress uwtable +define dso_local void @_Z4testv() local_unnamed_addr #0 !dbg !10 { +entry: + #dbg_value(i32 7, !14, !DIExpression(), !16) + tail call void @_Z2f1v(), !dbg !17 + #dbg_value(i32 8, !14, !DIExpression(), !16) + %0 = load i8, ptr @b, align 1, !dbg !18, !tbaa !20, !range !24, !noundef !25 + %loadedv = trunc nuw i8 %0 to i1, !dbg !18 + br i1 %loadedv, label %if.then, label %if.end, !dbg !26 + +if.then: ; preds = %entry + tail call void @_Z2f1v(), !dbg !27 + br label %if.end, !dbg !27 + +if.end: ; preds = %if.then, %entry + ret void, !dbg !28 +} + +declare !dbg !29 void @_Z2f1v() local_unnamed_addr #1 + +attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (git@github.com:)", isOptimized: true, 
runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "loclist_section.cc", directory: "Examples/debug_loc", checksumkind: CSK_MD5, checksum: "67769a94389681c8a6da481e2f358abb") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!9 = !{!"clang version 20.0.0git (git@github.com:.../llvm-project.git 7c3256280a78b0505ae4d43985c4d3239451a151)"} +!10 = distinct !DISubprogram(name: "test", linkageName: "_Z4testv", scope: !1, file: !1, line: 3, type: !11, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!13 = !{!14} +!14 = !DILocalVariable(name: "i", scope: !10, file: !1, line: 6, type: !15) +!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!16 = !DILocation(line: 0, scope: !10) +!17 = !DILocation(line: 7, column: 5, scope: !10) +!18 = !DILocation(line: 9, column: 9, scope: !19) +!19 = distinct !DILexicalBlock(scope: !10, file: !1, line: 9, column: 9) +!20 = !{!21, !21, i64 0} +!21 = !{!"bool", !22, i64 0} +!22 = !{!"omnipotent char", !23, i64 0} +!23 = !{!"Simple C++ TBAA"} +!24 = !{i8 0, i8 2} +!25 = !{} +!26 = !DILocation(line: 9, column: 9, scope: !10) +!27 = !DILocation(line: 10, column: 7, scope: !19) +!28 = !DILocation(line: 11, column: 1, scope: !10) +!29 = !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) diff --git a/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-7.ll b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-7.ll new file mode 100644 index 0000000000000..1afef53c45454 --- /dev/null 
+++ b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-7.ll @@ -0,0 +1,129 @@ +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s + +; CHECK: DW_TAG_lexical_block +; CHECK-NEXT: DW_AT_low_pc +; CHECK-NEXT: DW_AT_high_pc +; CHECK: DW_TAG_variable +; CHECK-NEXT: DW_AT_const_value (7) +; CHECK-NEXT: DW_AT_name ("i") + +; SECTIONS: DW_TAG_lexical_block +; SECTIONS-NEXT: DW_AT_ranges +; SECTIONS: DW_TAG_variable +; SECTIONS-NEXT: DW_AT_const_value (7) +; SECTIONS-NEXT: DW_AT_name ("i") + +; Test to check that a variable declared within a scope that has basic block +; sections still produces DW_AT_const_value. +; Source to generate the IR below: + +; void f1(int *); +; extern bool b; +; int test() { +; // i is const throughout the whole scope and should +; // use DW_AT_const_value. The scope creates basic +; // block sections and should use DW_AT_ranges. 
+; int j = 10; +; { +; int i = 7; +; f1(&j); +; if (b) +; f1(&j); +; } +; return j; +; } +; +; clang++ -S scoped_section_const.cc -g -O2 -emit-llvm + +@b = external local_unnamed_addr global i8, align 1 + +; Function Attrs: mustprogress uwtable +define dso_local noundef i32 @_Z4testv() local_unnamed_addr #0 !dbg !9 { + %1 = alloca i32, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %1) #4, !dbg !17 + call void @llvm.dbg.value(metadata i32 10, metadata !14, metadata !DIExpression()), !dbg !18 + store i32 10, ptr %1, align 4, !dbg !19, !tbaa !20 + call void @llvm.dbg.value(metadata i32 7, metadata !15, metadata !DIExpression()), !dbg !24 + call void @llvm.dbg.value(metadata ptr %1, metadata !14, metadata !DIExpression(DW_OP_deref)), !dbg !18 + call void @_Z2f1Pi(ptr noundef nonnull %1), !dbg !25 + %2 = load i8, ptr @b, align 1, !dbg !26, !tbaa !28, !range !30, !noundef !31 + %3 = icmp eq i8 %2, 0, !dbg !26 + br i1 %3, label %5, label %4, !dbg !32 + +4: ; preds = %0 + call void @llvm.dbg.value(metadata ptr %1, metadata !14, metadata !DIExpression(DW_OP_deref)), !dbg !18 + call void @_Z2f1Pi(ptr noundef nonnull %1), !dbg !33 + br label %5, !dbg !33 + +5: ; preds = %4, %0 + %6 = load i32, ptr %1, align 4, !dbg !34, !tbaa !20 + call void @llvm.dbg.value(metadata i32 %6, metadata !14, metadata !DIExpression()), !dbg !18 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %1) #4, !dbg !35 + ret i32 %6, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +declare !dbg !37 void @_Z2f1Pi(ptr noundef) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.value(metadata, 
metadata, metadata) #3 + +attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "Debian clang version 16.0.6 (26)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "scoped_section_const.cc", directory: "", checksumkind: CSK_MD5, checksum: "0406492d2e2e38af35d9ea210ba1f24b") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"Debian clang version 16.0.6 (26)"} +!9 = distinct !DISubprogram(name: "test", linkageName: "_Z4testv", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!10 = !DISubroutineType(types: !11) +!11 = !{!12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!14, !15} +!14 = !DILocalVariable(name: "j", scope: !9, file: !1, line: 6, type: !12) +!15 = !DILocalVariable(name: "i", scope: !16, file: !1, line: 8, type: !12) +!16 = distinct !DILexicalBlock(scope: !9, file: !1, line: 7, 
column: 5) +!17 = !DILocation(line: 6, column: 5, scope: !9) +!18 = !DILocation(line: 0, scope: !9) +!19 = !DILocation(line: 6, column: 9, scope: !9) +!20 = !{!21, !21, i64 0} +!21 = !{!"int", !22, i64 0} +!22 = !{!"omnipotent char", !23, i64 0} +!23 = !{!"Simple C++ TBAA"} +!24 = !DILocation(line: 0, scope: !16) +!25 = !DILocation(line: 9, column: 7, scope: !16) +!26 = !DILocation(line: 10, column: 11, scope: !27) +!27 = distinct !DILexicalBlock(scope: !16, file: !1, line: 10, column: 11) +!28 = !{!29, !29, i64 0} +!29 = !{!"bool", !22, i64 0} +!30 = !{i8 0, i8 2} +!31 = !{} +!32 = !DILocation(line: 10, column: 11, scope: !16) +!33 = !DILocation(line: 11, column: 9, scope: !27) +!34 = !DILocation(line: 13, column: 12, scope: !9) +!35 = !DILocation(line: 14, column: 1, scope: !9) +!36 = !DILocation(line: 13, column: 5, scope: !9) +!37 = !DISubprogram(name: "f1", linkageName: "_Z2f1Pi", scope: !1, file: !1, line: 1, type: !38, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !31) +!38 = !DISubroutineType(types: !39) +!39 = !{null, !40} +!40 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) diff --git a/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-8.ll b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-8.ll new file mode 100644 index 0000000000000..d1af3c2c3ef3e --- /dev/null +++ b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-8.ll @@ -0,0 +1,107 @@ +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=all -filetype=obj -o - 
| llvm-dwarfdump - | FileCheck %s + +; CHECK: DW_TAG_variable +; CHECK: DW_TAG_variable +; CHECK-NEXT: DW_AT_location +; CHECK-NEXT: DW_OP_consts +0, DW_OP_stack_value +; CHECK-NEXT: DW_OP_consts +7, DW_OP_stack_value +; CHECK-NEXT: DW_OP_consts +8, DW_OP_stack_value +; CHECK: DW_AT_name ("i") + +; void f1(int *); +; void f2(int); +; extern bool b; +; int test() { +; // i is not a const throughout the whole scope and +; // should *not* use DW_AT_const_value. +; int i = 0; +; int j = 10; +; { +; i = 7; +; f1(&j); +; } +; i = 8; +; f2(i); +; return j; +; } +; clang++ -S scoped_section.cc -g -O2 -emit-llvm + +; Function Attrs: mustprogress uwtable +define dso_local noundef i32 @_Z4testv() local_unnamed_addr #0 !dbg !10 { +entry: + %j = alloca i32, align 4, !DIAssignID !17 + #dbg_assign(i1 undef, !16, !DIExpression(), !17, ptr %j, !DIExpression(), !18) + #dbg_value(i32 0, !15, !DIExpression(), !18) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %j) #3, !dbg !19 + store i32 10, ptr %j, align 4, !dbg !20, !tbaa !21, !DIAssignID !25 + #dbg_assign(i32 10, !16, !DIExpression(), !25, ptr %j, !DIExpression(), !18) + #dbg_value(i32 7, !15, !DIExpression(), !18) + call void @_Z2f1Pi(ptr noundef nonnull %j), !dbg !26 + #dbg_value(i32 8, !15, !DIExpression(), !18) + call void @_Z2f2i(i32 noundef 8), !dbg !28 + %0 = load i32, ptr %j, align 4, !dbg !29, !tbaa !21 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j) #3, !dbg !30 + ret i32 %0, !dbg !31 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +declare !dbg !32 void @_Z2f1Pi(ptr noundef) local_unnamed_addr #2 + +declare !dbg !36 void @_Z2f2i(i32 noundef) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +attributes #0 = { mustprogress uwtable 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (git@github.com:tmsri/llvm-project.git 11a50269e82b6dce49249c5cbe3a989b06f0848f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "scoped_section.cc", directory: "", checksumkind: CSK_MD5, checksum: "2d5675e292541e4f04eb60edf76b14d6") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!9 = !{!"clang version 20.0.0git (git@github.com:tmsri/llvm-project.git 11a50269e82b6dce49249c5cbe3a989b06f0848f)"} +!10 = distinct !DISubprogram(name: "test", linkageName: "_Z4testv", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14) +!11 = !DISubroutineType(types: !12) +!12 = !{!13} +!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!14 = !{!15, !16} +!15 = !DILocalVariable(name: "i", scope: !10, file: !1, line: 7, type: !13) +!16 = !DILocalVariable(name: "j", scope: !10, file: !1, line: 8, type: !13) +!17 = 
distinct !DIAssignID() +!18 = !DILocation(line: 0, scope: !10) +!19 = !DILocation(line: 8, column: 5, scope: !10) +!20 = !DILocation(line: 8, column: 9, scope: !10) +!21 = !{!22, !22, i64 0} +!22 = !{!"int", !23, i64 0} +!23 = !{!"omnipotent char", !24, i64 0} +!24 = !{!"Simple C++ TBAA"} +!25 = distinct !DIAssignID() +!26 = !DILocation(line: 11, column: 7, scope: !27) +!27 = distinct !DILexicalBlock(scope: !10, file: !1, line: 9, column: 5) +!28 = !DILocation(line: 14, column: 5, scope: !10) +!29 = !DILocation(line: 15, column: 12, scope: !10) +!30 = !DILocation(line: 16, column: 1, scope: !10) +!31 = !DILocation(line: 15, column: 5, scope: !10) +!32 = !DISubprogram(name: "f1", linkageName: "_Z2f1Pi", scope: !1, file: !1, line: 1, type: !33, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!33 = !DISubroutineType(types: !34) +!34 = !{null, !35} +!35 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64) +!36 = !DISubprogram(name: "f2", linkageName: "_Z2f2i", scope: !1, file: !1, line: 2, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!37 = !DISubroutineType(types: !38) +!38 = !{null, !13} \ No newline at end of file diff --git a/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-9.ll b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-9.ll new file mode 100644 index 0000000000000..a60fd106745b9 --- /dev/null +++ b/llvm/test/DebugInfo/X86/basic-block-sections-debug-loclist-9.ll @@ -0,0 +1,142 @@ +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=4 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu --dwarf-version=5 --basic-block-sections=none -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s +; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu 
--dwarf-version=5 --basic-block-sections=all -filetype=obj -o - | llvm-dwarfdump - | FileCheck --check-prefix=SECTIONS %s + +; CHECK: DW_TAG_subprogram +; CHECK-NEXT: DW_AT_low_pc +; CHECK-NEXT: DW_AT_high_pc +; CHECK: DW_TAG_variable +; CHECK: DW_TAG_variable +; CHECK-NEXT: DW_AT_location +; CHECK-NEXT: DW_OP_consts +0, DW_OP_stack_value +; CHECK-NEXT: DW_OP_consts +7, DW_OP_stack_value +; CHECK-NEXT: DW_OP_consts +8, DW_OP_stack_value +; CHECK-NEXT: DW_AT_name ("i") + +; SECTIONS: DW_TAG_subprogram +; SECTIONS-NEXT: DW_AT_ranges +; SECTIONS: DW_TAG_variable +; SECTIONS: DW_TAG_variable +; SECTIONS-NEXT: DW_AT_location +; SECTIONS-NEXT: DW_OP_consts +0, DW_OP_stack_value +; SECTIONS-NEXT: DW_OP_consts +7, DW_OP_stack_value +; SECTIONS-NEXT: DW_OP_consts +7, DW_OP_stack_value +; SECTIONS-NEXT: DW_OP_consts +8, DW_OP_stack_value +; SECTIONS-NEXT: DW_AT_name ("i") + +; void f1(int *); +; void f2(int); +; extern bool b; +; int test() { +; // i is not a const throughout the whole scope and +; // should *not* use DW_AT_const_value. The scope +; // creates basic block sections and should use +; // DW_AT_ranges. 
+; int i = 0; +; int j = 10; +; { +; i = 7; +; if (b) +; f1(&j); +; } +; i = 8; +; f2(i); +; return j; +; } +; clang++ -S scoped_section.cc -g -O2 -emit-llvm + + +@b = external local_unnamed_addr global i8, align 1 + +; Function Attrs: mustprogress uwtable +define dso_local noundef i32 @_Z4testv() local_unnamed_addr #0 !dbg !10 { +entry: + %j = alloca i32, align 4, !DIAssignID !17 + #dbg_assign(i1 undef, !16, !DIExpression(), !17, ptr %j, !DIExpression(), !18) + #dbg_value(i32 0, !15, !DIExpression(), !18) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %j) #3, !dbg !19 + store i32 10, ptr %j, align 4, !dbg !20, !tbaa !21, !DIAssignID !25 + #dbg_assign(i32 10, !16, !DIExpression(), !25, ptr %j, !DIExpression(), !18) + #dbg_value(i32 7, !15, !DIExpression(), !18) + %0 = load i8, ptr @b, align 1, !dbg !26, !tbaa !29, !range !31, !noundef !32 + %loadedv = trunc nuw i8 %0 to i1, !dbg !26 + br i1 %loadedv, label %if.then, label %if.end, !dbg !33 + +if.then: ; preds = %entry + call void @_Z2f1Pi(ptr noundef nonnull %j), !dbg !34 + br label %if.end, !dbg !34 + +if.end: ; preds = %if.then, %entry + #dbg_value(i32 8, !15, !DIExpression(), !18) + call void @_Z2f2i(i32 noundef 8), !dbg !35 + %1 = load i32, ptr %j, align 4, !dbg !36, !tbaa !21 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %j) #3, !dbg !37 + ret i32 %1, !dbg !38 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +declare !dbg !39 void @_Z2f1Pi(ptr noundef) local_unnamed_addr #2 + +declare !dbg !43 void @_Z2f2i(i32 noundef) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (git@github.com:tmsri/llvm-project.git 11a50269e82b6dce49249c5cbe3a989b06f0848f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "scoped_section.cc", directory: "", checksumkind: CSK_MD5, checksum: "9dda8d16c16edf7724c901692e07587c") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!9 = !{!"clang version 20.0.0git (git@github.com:tmsri/llvm-project.git 11a50269e82b6dce49249c5cbe3a989b06f0848f)"} +!10 = distinct !DISubprogram(name: "test", linkageName: "_Z4testv", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14) +!11 = !DISubroutineType(types: !12) +!12 = !{!13} +!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!14 = !{!15, !16} +!15 = !DILocalVariable(name: "i", scope: !10, file: !1, line: 7, type: !13) +!16 = !DILocalVariable(name: "j", scope: !10, file: !1, line: 8, type: !13) +!17 = distinct !DIAssignID() +!18 = !DILocation(line: 0, scope: !10) +!19 = !DILocation(line: 8, column: 5, scope: !10) 
+!20 = !DILocation(line: 8, column: 9, scope: !10) +!21 = !{!22, !22, i64 0} +!22 = !{!"int", !23, i64 0} +!23 = !{!"omnipotent char", !24, i64 0} +!24 = !{!"Simple C++ TBAA"} +!25 = distinct !DIAssignID() +!26 = !DILocation(line: 11, column: 11, scope: !27) +!27 = distinct !DILexicalBlock(scope: !28, file: !1, line: 11, column: 11) +!28 = distinct !DILexicalBlock(scope: !10, file: !1, line: 9, column: 5) +!29 = !{!30, !30, i64 0} +!30 = !{!"bool", !23, i64 0} +!31 = !{i8 0, i8 2} +!32 = !{} +!33 = !DILocation(line: 11, column: 11, scope: !28) +!34 = !DILocation(line: 12, column: 9, scope: !27) +!35 = !DILocation(line: 15, column: 5, scope: !10) +!36 = !DILocation(line: 16, column: 12, scope: !10) +!37 = !DILocation(line: 17, column: 1, scope: !10) +!38 = !DILocation(line: 16, column: 5, scope: !10) +!39 = !DISubprogram(name: "f1", linkageName: "_Z2f1Pi", scope: !1, file: !1, line: 1, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!40 = !DISubroutineType(types: !41) +!41 = !{null, !42} +!42 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64) +!43 = !DISubprogram(name: "f2", linkageName: "_Z2f2i", scope: !1, file: !1, line: 2, type: !44, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!44 = !DISubroutineType(types: !45) +!45 = !{null, !13} \ No newline at end of file diff --git a/llvm/test/DebugInfo/X86/basic-block-sections_1.ll b/llvm/test/DebugInfo/X86/basic-block-sections_1.ll index 12b60c4dc321b..c90d715142ec8 100644 --- a/llvm/test/DebugInfo/X86/basic-block-sections_1.ll +++ b/llvm/test/DebugInfo/X86/basic-block-sections_1.ll @@ -16,10 +16,10 @@ ; NO-SECTIONS: DW_AT_high_pc [DW_FORM_data4] ({{.*}}) ; BB-SECTIONS: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) ; BB-SECTIONS-NEXT: DW_AT_ranges [DW_FORM_sec_offset] +; BB-SECTIONS-NEXT: [{{.*}}) ".text.hot._Z3fooi" ; BB-SECTIONS-NEXT: [{{.*}}) ".text.hot._Z3fooi._Z3fooi.__part.1" ; BB-SECTIONS-NEXT: [{{.*}}) ".text.hot._Z3fooi._Z3fooi.__part.2" ; BB-SECTIONS-NEXT: [{{.*}}) 
".text.hot._Z3fooi._Z3fooi.__part.3" -; BB-SECTIONS-NEXT: [{{.*}}) ".text.hot._Z3fooi" ; BB-SECTIONS-ASM: _Z3fooi: ; BB-SECTIONS-ASM: .Ltmp{{[0-9]+}}: ; BB-SECTIONS-ASM-NEXT: .loc 1 2 9 prologue_end @@ -36,14 +36,14 @@ ; BB-SECTIONS-ASM: .size _Z3fooi.__part.3, .LBB_END0_{{[0-9]+}}-_Z3fooi.__part.3 ; BB-SECTIONS-ASM: .Lfunc_end0: ; BB-SECTIONS-ASM: .Ldebug_ranges0: +; BB-SECTIONS-ASM-NEXT: .quad .Lfunc_begin0 +; BB-SECTIONS-ASM-NEXT: .quad .Lfunc_end0 ; BB-SECTIONS-ASM-NEXT: .quad _Z3fooi.__part.1 ; BB-SECTIONS-ASM-NEXT: .quad .LBB_END0_{{[0-9]+}} ; BB-SECTIONS-ASM-NEXT: .quad _Z3fooi.__part.2 ; BB-SECTIONS-ASM-NEXT: .quad .LBB_END0_{{[0-9]+}} ; BB-SECTIONS-ASM-NEXT: .quad _Z3fooi.__part.3 ; BB-SECTIONS-ASM-NEXT: .quad .LBB_END0_{{[0-9]+}} -; BB-SECTIONS-ASM-NEXT: .quad .Lfunc_begin0 -; BB-SECTIONS-ASM-NEXT: .quad .Lfunc_end0 ; BB-SECTIONS-ASM-NEXT: .quad 0 ; BB-SECTIONS-ASM-NEXT: .quad 0 ; BB-SECTIONS-LINE-TABLE: 0x0000000000000000 1 0 1 0 0 0 is_stmt diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_pdata_strip.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_pdata_strip.s index e3a752df471c2..a9caf7a763806 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_pdata_strip.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_pdata_strip.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t -# RUN: +# RUN: # RUN: llvm-jitlink -abs __ImageBase=0xdeadbeaf -noexec %t \ # RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \ # RUN: -show-graphs='.*' -noexec 2>&1 | FileCheck %s @@ -14,7 +14,7 @@ # CHECK-EMPTY: .text - + .def main; .scl 2; .type 32; @@ -31,7 +31,7 @@ main: .type 32; .endef .p2align 4, 0x90 -func: +func: .seh_proc func subq $40, %rsp .seh_stackalloc 40 diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll new file mode 100644 index 0000000000000..2505a3e95fe02 --- /dev/null +++ 
b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_tbl.ll @@ -0,0 +1,877 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2 +; Test memory sanitizer instrumentation for Arm NEON tbl instructions. +; +; RUN: opt < %s -passes=msan -S | FileCheck %s +; +; Forked from llvm/test/CodeGen/AArch64/arm64-tbl.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbl1_8b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to i64 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B) + ret <8 x i8> %out +} + +define <16 x i8> @tbl1_16b(<16 x i8> 
%A, <16 x i8> %B) nounwind sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbl1_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B) + ret <16 x i8> %out +} + +define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbl2_8b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], 
!prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) + ret <8 x i8> %out +} + +define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbl2_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) + ret <16 x i8> %out +} + +define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbl3_8b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) + ret <8 x i8> %out +} + +define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbl3_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) + ret <16 x i8> %out +} + +define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbl4_8b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) + ret <8 x i8> %out +} + +define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbl4_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) + ret <16 x i8> %out +} + + + +define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 
8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[T1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[A]], <16 x i8> [[B]], <8 x i8> ) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: [[T2:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> ) +; CHECK-NEXT: [[S:%.*]] = shufflevector <8 x i8> [[T1]], <8 x i8> [[T2]], <8 x i32> +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[S]] +; + %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> ) + %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> ) + %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> + ret <8 x i8> %s +} + + + +define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> 
[[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + +define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> 
[[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> , i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1 +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2 +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3 +; CHECK-NEXT: [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4 +; CHECK-NEXT: [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5 +; CHECK-NEXT: [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6 +; CHECK-NEXT: [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 
[[TMP1]], i32 7 +; CHECK-NEXT: [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8 +; CHECK-NEXT: [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8 +; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9 +; CHECK-NEXT: [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9 +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10 +; CHECK-NEXT: [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11 +; CHECK-NEXT: [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12 +; CHECK-NEXT: [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12 +; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13 +; CHECK-NEXT: [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13 +; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14 +; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15 +; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15 +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], [[_MSPROP15]] +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: 
[[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 + %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 + %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 + %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3 + %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4 + %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5 + %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6 + %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7 + %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8 + %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9 + %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10 + %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11 + %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12 + %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13 + %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14 + %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15 + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + +define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] 
= load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0 +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1 +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 1, i32 2 +; CHECK-NEXT: [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 1, i32 3 +; CHECK-NEXT: [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 1, i32 4 +; CHECK-NEXT: [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 1, i32 5 +; CHECK-NEXT: [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 1, i32 6 +; CHECK-NEXT: [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 1, i32 7 +; CHECK-NEXT: [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8 +; CHECK-NEXT: [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9 +; CHECK-NEXT: [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10 +; CHECK-NEXT: [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> , i8 [[TMP1]], i32 12 +; CHECK-NEXT: [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 [[V]], i32 12 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 13 +; CHECK-NEXT: [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 [[V]], i32 13 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 0, i32 14 +; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 +; CHECK-NEXT: [[_MSPROP3:%.*]] = 
insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 15 +; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i8> [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP6:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = or <16 x i8> [[_MSPROP6]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP8:%.*]] = shufflevector <16 x i8> [[_MSPROP5]], <16 x i8> [[_MSPROP7]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0 + %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1 + %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2 + %ins.3 = insertelement <16 x i8> %ins.2, i8 1, i32 3 + %ins.4 = insertelement <16 x i8> %ins.3, i8 1, i32 4 + %ins.5 = insertelement <16 x i8> %ins.4, i8 1, i32 5 + %ins.6 = insertelement <16 x i8> %ins.5, i8 1, i32 6 + %ins.7 = insertelement <16 x i8> %ins.6, i8 1, i32 7 + %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8 + %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9 + %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10 + %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11 + %ins.12 = insertelement <16 x i8> %ins.11, i8 %v, i32 12 + %ins.13 = insertelement <16 x i8> %ins.12, i8 %v, i32 13 + %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14 + %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15 + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15) + %t2 = 
call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + + +define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> , i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1 +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2 +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3 +; CHECK-NEXT: [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4 +; CHECK-NEXT: [[INS_4:%.*]] = 
insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5 +; CHECK-NEXT: [[INS_5:%.*]] = insertelement <16 x i8> [[INS_4]], i8 [[V]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6 +; CHECK-NEXT: [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7 +; CHECK-NEXT: [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8 +; CHECK-NEXT: [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8 +; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9 +; CHECK-NEXT: [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9 +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10 +; CHECK-NEXT: [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11 +; CHECK-NEXT: [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12 +; CHECK-NEXT: [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12 +; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13 +; CHECK-NEXT: [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13 +; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 0, i32 14 +; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 -1, i32 14 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 0, i32 15 +; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 -1, i32 15 +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: 
[[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]] +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 + %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 + %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 + %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3 + %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4 + %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5 + %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6 + %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7 + %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8 + %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9 + %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10 + %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11 + %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12 + %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13 + %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14 + %ins.15 = insertelement <16 x i8> %ins.14, i8 -1, i32 15 + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + + +define <16 x i8> 
@shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> , i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i8> [[_MSPROP]], i8 [[TMP1]], i32 1 +; CHECK-NEXT: [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 [[V]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i8> [[_MSPROP1]], i8 [[TMP1]], i32 2 +; CHECK-NEXT: [[INS_2:%.*]] = insertelement <16 x i8> [[INS_1]], i8 [[V]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i8> [[_MSPROP2]], i8 [[TMP1]], i32 3 +; CHECK-NEXT: [[INS_3:%.*]] = insertelement <16 x i8> [[INS_2]], i8 [[V]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i8> [[_MSPROP3]], i8 [[TMP1]], i32 4 +; CHECK-NEXT: [[INS_4:%.*]] = insertelement <16 x i8> [[INS_3]], i8 [[V]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i8> [[_MSPROP4]], i8 [[TMP1]], i32 5 +; CHECK-NEXT: [[INS_5:%.*]] = insertelement <16 x i8> 
[[INS_4]], i8 [[V]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i8> [[_MSPROP5]], i8 [[TMP1]], i32 6 +; CHECK-NEXT: [[INS_6:%.*]] = insertelement <16 x i8> [[INS_5]], i8 [[V]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i8> [[_MSPROP6]], i8 [[TMP1]], i32 7 +; CHECK-NEXT: [[INS_7:%.*]] = insertelement <16 x i8> [[INS_6]], i8 [[V]], i32 7 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i8> [[_MSPROP7]], i8 0, i32 8 +; CHECK-NEXT: [[INS_8:%.*]] = insertelement <16 x i8> [[INS_7]], i8 -1, i32 8 +; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i8> [[_MSPROP8]], i8 0, i32 9 +; CHECK-NEXT: [[INS_9:%.*]] = insertelement <16 x i8> [[INS_8]], i8 -1, i32 9 +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i8> [[_MSPROP9]], i8 0, i32 10 +; CHECK-NEXT: [[INS_10:%.*]] = insertelement <16 x i8> [[INS_9]], i8 -1, i32 10 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i8> [[_MSPROP10]], i8 0, i32 11 +; CHECK-NEXT: [[INS_11:%.*]] = insertelement <16 x i8> [[INS_10]], i8 -1, i32 11 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i8> [[_MSPROP11]], i8 0, i32 12 +; CHECK-NEXT: [[INS_12:%.*]] = insertelement <16 x i8> [[INS_11]], i8 -1, i32 12 +; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i8> [[_MSPROP12]], i8 0, i32 13 +; CHECK-NEXT: [[INS_13:%.*]] = insertelement <16 x i8> [[INS_12]], i8 -1, i32 13 +; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i8> [[_MSPROP13]], i8 [[TMP1]], i32 14 +; CHECK-NEXT: [[INS_14:%.*]] = insertelement <16 x i8> [[INS_13]], i8 [[V]], i32 14 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x i8> [[_MSPROP14]], i8 [[TMP1]], i32 15 +; CHECK-NEXT: [[INS_15:%.*]] = insertelement <16 x i8> [[INS_14]], i8 [[V]], i32 15 +; CHECK-NEXT: [[_MSPROP16:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <16 x i8> [[_MSPROP16]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; 
CHECK-NEXT: [[_MSPROP18:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = or <16 x i8> [[_MSPROP18]], [[_MSPROP15]] +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[INS_15]]) +; CHECK-NEXT: [[_MSPROP20:%.*]] = shufflevector <16 x i8> [[_MSPROP17]], <16 x i8> [[_MSPROP19]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0 + %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1 + %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2 + %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3 + %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4 + %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5 + %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6 + %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7 + %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8 + %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9 + %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10 + %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11 + %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12 + %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13 + %ins.14 = insertelement <16 x i8> %ins.13, i8 %v, i32 14 + %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15 + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + + +define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle +; CHECK-SAME: (<16 
x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + + +define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> 
[[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + + + +define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2 +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 
x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP4:%.*]] = shufflevector <16 x i8> [[_MSPROP1]], <16 x i8> [[_MSPROP3]], <16 x i32> +; CHECK-NEXT: [[S:%.*]] = shufflevector <16 x i8> [[T1]], <16 x i8> [[T2]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[S]] +; + %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> ) + %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> + ret <16 x i8> %s +} + +declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare 
<16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone + +define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbx1_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) + ret <8 x i8> %out +} + +define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbx1_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) + ret <16 x i8> %out +} + +define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbx2_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <8 x i8> [[D]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) + ret <8 x i8> %out +} + +define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbx2_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = 
load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) + ret <16 x i8> %out +} + +define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbx3_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = 
bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <8 x i8> [[E]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) + ret <8 x i8> %out +} + +define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) sanitize_memory { +; CHECK-LABEL: define <16 x i8> @tbx3_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) + ret <16 x i8> %out +} + +define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) sanitize_memory { +; CHECK-LABEL: define <8 x i8> @tbx4_8b +; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x 
i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP4]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to i64 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[OUT:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <8 x i8> [[F]]) +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[OUT]] +; + %out = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) + ret <8 x i8> %out +} + +define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) sanitize_memory { +; CHECK-LABEL: define 
<16 x i8> @tbx4_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> [[_MSPROP1]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i8> [[_MSPROP2]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i8> [[_MSPROP3]], [[TMP6]] +; CHECK-NEXT: [[OUT:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], <16 x i8> [[E]], <16 x i8> [[F]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[OUT]] +; + %out = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) + ret <16 x i8> %out +} + +declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind 
sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone +declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind sanitize_memory readnone +declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind sanitize_memory readnone diff --git a/llvm/test/MC/AArch64/SME/revd-diagnostics.s b/llvm/test/MC/AArch64/SME/revd-diagnostics.s index 42205c26ce93f..e7242e5dc1bbb 100644 --- a/llvm/test/MC/AArch64/SME/revd-diagnostics.s +++ b/llvm/test/MC/AArch64/SME/revd-diagnostics.s @@ -11,7 +11,7 @@ revd z0.q, p8/m, z0.q // wrong predication qualifier, expected /m. 
revd z0.q, p0/z, z0.q -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction requires: sme2p2 or sve2p2 // CHECK-NEXT: revd z0.q, p0/z, z0.q // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening-diagnostics.s new file mode 100644 index 0000000000000..231d4cd9967a4 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening-diagnostics.s @@ -0,0 +1,220 @@ +// RUN: not llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-b16b16 < %s 2>&1 | FileCheck %s + +// BFMOP4A + +// Single vectors + +bfmop4a za0.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4a za4.h, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z15.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z0.h, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4a za0.h, z12.h, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4a za0.h, z12.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4a za0.h, z12.h, z31.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Single and multiple vectors + +bfmop4a za0.d, z0.h, {z16.h-z17.h} 
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4a za4.h, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, z0.s, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z1.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z16.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4a za0.h, z0.h, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, z0.h, {z17.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, z0.h, {z12.h-z13.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +bfmop4a za0.d, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4a za4.h, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, {z0.s-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +bfmop4a za0.h, {z1.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 
consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, {z0.h-z1.h}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4a za0.h, {z0.h-z1.h}, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4a za0.h, {z0.h-z1.h}, z12.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Multiple vectors + +bfmop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4a za4.h, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, {z0.s-z1.s}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, {z1.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, {z18.h-z19.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, {z0.h-z1.h}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4a za0.h, {z0.h-z1.h}, {z19.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +bfmop4a za0.h, {z0.h-z1.h}, {z10.h-z11.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the 
range z16-z30, where the first vector is a multiple of 2 and with matching element types + + +// BFMOP4S + +// Single vectors + +bfmop4s za0.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4s za4.h, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z15.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z0.h, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4s za0.h, z12.h, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4s za0.h, z12.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4s za0.h, z12.h, z31.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Single and multiple vectors + +bfmop4s za0.d, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4s za4.h, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, z0.s, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z1.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z16.h, {z16.h-z17.h} +// 
CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h + +bfmop4s za0.h, z0.h, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, z0.h, {z17.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, z0.h, {z12.h-z13.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +bfmop4s za0.d, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4s za4.h, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, {z0.s-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +bfmop4s za0.h, {z1.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, {z0.h-z1.h}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4s za0.h, {z0.h-z1.h}, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +bfmop4s za0.h, {z0.h-z1.h}, z12.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected 
even register in z16.h..z30.h + +// Multiple vectors + +bfmop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s + +bfmop4s za4.h, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, {z0.s-z1.s}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, {z1.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, {z18.h-z19.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, {z0.h-z1.h}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +bfmop4s za0.h, {z0.h-z1.h}, {z19.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +bfmop4s za0.h, {z0.h-z1.h}, {z10.h-z11.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types diff --git a/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening.s b/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening.s new file mode 100644 index 0000000000000..b98bb99def056 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/bfmop4as-non-widening.s @@ -0,0 +1,178 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-b16b16 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 
-show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-b16b16 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-b16b16 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-b16b16 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-b16b16 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// BFMOP4A + +// Single vectors + +bfmop4a za0.h, z0.h, z16.h // 10000001-00100000-00000000-00001000 +// CHECK-INST: bfmop4a za0.h, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x00,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81200008 + +bfmop4a za1.h, z12.h, z24.h // 10000001-00101000-00000001-10001001 +// CHECK-INST: bfmop4a za1.h, z12.h, z24.h +// CHECK-ENCODING: [0x89,0x01,0x28,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81280189 + +bfmop4a za1.h, z14.h, z30.h // 10000001-00101110-00000001-11001001 +// CHECK-INST: bfmop4a za1.h, z14.h, z30.h +// CHECK-ENCODING: [0xc9,0x01,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 812e01c9 + +// Single and multiple vectors + +bfmop4a za0.h, z0.h, {z16.h-z17.h} // 10000001-00110000-00000000-00001000 +// CHECK-INST: bfmop4a za0.h, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x00,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81300008 + +bfmop4a za1.h, z12.h, {z24.h-z25.h} // 10000001-00111000-00000001-10001001 +// CHECK-INST: bfmop4a za1.h, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: 
[0x89,0x01,0x38,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81380189 + +bfmop4a za1.h, z14.h, {z30.h-z31.h} // 10000001-00111110-00000001-11001001 +// CHECK-INST: bfmop4a za1.h, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xc9,0x01,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 813e01c9 + +// Multiple and single vectors + +bfmop4a za0.h, {z0.h-z1.h}, z16.h // 10000001-00100000-00000010-00001000 +// CHECK-INST: bfmop4a za0.h, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x02,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81200208 + +bfmop4a za1.h, {z12.h-z13.h}, z24.h // 10000001-00101000-00000011-10001001 +// CHECK-INST: bfmop4a za1.h, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x89,0x03,0x28,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81280389 + +bfmop4a za1.h, {z14.h-z15.h}, z30.h // 10000001-00101110-00000011-11001001 +// CHECK-INST: bfmop4a za1.h, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xc9,0x03,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 812e03c9 + +// Multiple vectors + +bfmop4a za0.h, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00110000-00000010-00001000 +// CHECK-INST: bfmop4a za0.h, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x02,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81300208 + +bfmop4a za1.h, {z12.h-z13.h}, {z24.h-z25.h} // 10000001-00111000-00000011-10001001 +// CHECK-INST: bfmop4a za1.h, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x89,0x03,0x38,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81380389 + +bfmop4a za1.h, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00111110-00000011-11001001 +// CHECK-INST: bfmop4a za1.h, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xc9,0x03,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 
sme-b16b16 +// CHECK-UNKNOWN: 813e03c9 + + +// BFMOP4S + +// Single vectors + +bfmop4s za0.h, z0.h, z16.h // 10000001-00100000-00000000-00011000 +// CHECK-INST: bfmop4s za0.h, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x00,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81200018 + +bfmop4s za1.h, z12.h, z24.h // 10000001-00101000-00000001-10011001 +// CHECK-INST: bfmop4s za1.h, z12.h, z24.h +// CHECK-ENCODING: [0x99,0x01,0x28,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81280199 + +bfmop4s za1.h, z14.h, z30.h // 10000001-00101110-00000001-11011001 +// CHECK-INST: bfmop4s za1.h, z14.h, z30.h +// CHECK-ENCODING: [0xd9,0x01,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 812e01d9 + +// Single and multiple vectors + +bfmop4s za0.h, z0.h, {z16.h-z17.h} // 10000001-00110000-00000000-00011000 +// CHECK-INST: bfmop4s za0.h, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x00,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81300018 + +bfmop4s za1.h, z12.h, {z24.h-z25.h} // 10000001-00111000-00000001-10011001 +// CHECK-INST: bfmop4s za1.h, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: [0x99,0x01,0x38,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81380199 + +bfmop4s za1.h, z14.h, {z30.h-z31.h} // 10000001-00111110-00000001-11011001 +// CHECK-INST: bfmop4s za1.h, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xd9,0x01,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 813e01d9 + +// Multiple and single vectors + +bfmop4s za0.h, {z0.h-z1.h}, z16.h // 10000001-00100000-00000010-00011000 +// CHECK-INST: bfmop4s za0.h, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x02,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81200218 + +bfmop4s za1.h, {z12.h-z13.h}, z24.h // 10000001-00101000-00000011-10011001 +// CHECK-INST: 
bfmop4s za1.h, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x99,0x03,0x28,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81280399 + +bfmop4s za1.h, {z14.h-z15.h}, z30.h // 10000001-00101110-00000011-11011001 +// CHECK-INST: bfmop4s za1.h, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xd9,0x03,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 812e03d9 + +// Multiple vectors + +bfmop4s za0.h, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00110000-00000010-00011000 +// CHECK-INST: bfmop4s za0.h, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x02,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81300218 + +bfmop4s za1.h, {z12.h-z13.h}, {z24.h-z25.h} // 10000001-00111000-00000011-10011001 +// CHECK-INST: bfmop4s za1.h, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x99,0x03,0x38,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 81380399 + +bfmop4s za1.h, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00111110-00000011-11011001 +// CHECK-INST: bfmop4s za1.h, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xd9,0x03,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 sme-b16b16 +// CHECK-UNKNOWN: 813e03d9 diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening-diagnostics.s new file mode 100644 index 0000000000000..457add20355e8 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening-diagnostics.s @@ -0,0 +1,243 @@ +// RUN: not llvm-mc -triple=aarch64 -mattr=+sme2p2 < %s 2>&1 | FileCheck %s + +// FMOP4A + +// Single vectors + +fmop4a za0.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.d, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted 
vector register + +fmop4a za0.s, z15.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4a za0.s, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4a za0.s, z0.h, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4a za0.s, z12.h, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4a za0.s, z12.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4a za0.s, z12.h, z31.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Single and multiple vectors + +fmop4a za0.d, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.d, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4a za0.s, z1.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4a za0.s, z16.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4a za0.s, z0.h, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.h, {z17.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, z0.h, {z16.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.h, {z12.h-z13.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 
consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4a za0.d, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.d-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +fmop4a za0.s, {z1.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za4.s, {z0.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.h-z1.h}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4a za0.s, {z0.h-z1.h}, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4a za0.s, {z0.h-z1.h}, z12.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Multiple vectors + +fmop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.d-z1.d}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z1.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, 
expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z18.h-z19.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.h-z1.h}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.h-z1.h}, {z19.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.h-z1.h}, {z18.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.h-z1.h}, {z10.h-z11.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// FMOP4S + +// Single vectors + +fmop4s za0.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.d, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z15.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z0.h, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4s za0.s, z12.h, z17.h +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4s za0.s, z12.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4s za0.s, z12.h, z31.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Single and multiple vectors + +fmop4s za0.d, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, z0.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.d, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z1.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z16.h, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register + +fmop4s za0.s, z0.h, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.h, {z17.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, z0.h, {z16.h-z18.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.h, {z12.h-z13.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4s za0.d, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, {z0.h-z1.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.d-z1.h}, z16.h +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +fmop4s za0.s, {z1.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.h-z2.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.h-z1.h}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4s za0.s, {z0.h-z1.h}, z17.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +fmop4s za0.s, {z0.h-z1.h}, z12.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h + +// Multiple vectors + +fmop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, {z0.h-z1.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.d-z1.d}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z1.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.h-z2.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z18.h-z19.h}, {z16.h-z17.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors 
in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.h-z1.h}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.h-z1.h}, {z19.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.h-z1.h}, {z18.h-z20.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.h-z1.h}, {z10.h-z11.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening.s new file mode 100644 index 0000000000000..d615fb85b4fd7 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp16-fp32-widening.s @@ -0,0 +1,177 @@ + +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + + +// FMOP4A + +// Single vectors +fmop4a za0.s, z0.h, z16.h // 10000001-00100000-00000000-00000000 +// CHECK-INST: fmop4a za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x00,0x00,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81200000 + +fmop4a za1.s, z10.h, z20.h // 10000001-00100100-00000001-01000001 +// CHECK-INST: fmop4a za1.s, z10.h, z20.h +// CHECK-ENCODING: [0x41,0x01,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81240141 + +fmop4a za3.s, z14.h, z30.h // 10000001-00101110-00000001-11000011 +// CHECK-INST: fmop4a za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xc3,0x01,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e01c3 + +// Single and multiple vectors + +fmop4a za0.s, z0.h, {z16.h-z17.h} // 10000001-00110000-00000000-00000000 +// CHECK-INST: fmop4a za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x00,0x00,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81300000 + +fmop4a za1.s, z10.h, {z20.h-z21.h} // 10000001-00110100-00000001-01000001 +// CHECK-INST: fmop4a za1.s, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x41,0x01,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81340141 + +fmop4a za3.s, z14.h, {z30.h-z31.h} // 10000001-00111110-00000001-11000011 +// CHECK-INST: fmop4a za3.s, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xc3,0x01,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e01c3 + +// Multiple and single vectors + +fmop4a za0.s, {z0.h-z1.h}, z16.h // 10000001-00100000-00000010-00000000 +// CHECK-INST: fmop4a za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x00,0x02,0x20,0x81] +// CHECK-ERROR: instruction 
requires: sme2p2 +// CHECK-UNKNOWN: 81200200 + +fmop4a za1.s, {z10.h-z11.h}, z20.h // 10000001-00100100-00000011-01000001 +// CHECK-INST: fmop4a za1.s, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x41,0x03,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81240341 + +fmop4a za3.s, {z14.h-z15.h}, z30.h // 10000001-00101110-00000011-11000011 +// CHECK-INST: fmop4a za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xc3,0x03,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e03c3 + +// Multiple vectors + +fmop4a za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00110000-00000010-00000000 +// CHECK-INST: fmop4a za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x00,0x02,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81300200 + +fmop4a za1.s, {z10.h-z11.h}, {z20.h-z21.h} // 10000001-00110100-00000011-01000001 +// CHECK-INST: fmop4a za1.s, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x41,0x03,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81340341 + +fmop4a za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00111110-00000011-11000011 +// CHECK-INST: fmop4a za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xc3,0x03,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e03c3 + +// FMOP4S + +// Single vectors +fmop4s za0.s, z0.h, z16.h // 10000001-00100000-00000000-00010000 +// CHECK-INST: fmop4s za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x10,0x00,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81200010 + +fmop4s za1.s, z10.h, z20.h // 10000001-00100100-00000001-01010001 +// CHECK-INST: fmop4s za1.s, z10.h, z20.h +// CHECK-ENCODING: [0x51,0x01,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81240151 + +fmop4s za3.s, z14.h, z30.h // 10000001-00101110-00000001-11010011 +// CHECK-INST: fmop4s za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xd3,0x01,0x2e,0x81] +// 
CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e01d3 + +// Single and multiple vectors + +fmop4s za0.s, z0.h, {z16.h-z17.h} // 10000001-00110000-00000000-00010000 +// CHECK-INST: fmop4s za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x10,0x00,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81300010 + +fmop4s za1.s, z10.h, {z20.h-z21.h} // 10000001-00110100-00000001-01010001 +// CHECK-INST: fmop4s za1.s, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x51,0x01,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81340151 + +fmop4s za3.s, z14.h, {z30.h-z31.h} // 10000001-00111110-00000001-11010011 +// CHECK-INST: fmop4s za3.s, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xd3,0x01,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e01d3 + +// Multiple and single vectors + +fmop4s za0.s, {z0.h-z1.h}, z16.h // 10000001-00100000-00000010-00010000 +// CHECK-INST: fmop4s za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x10,0x02,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81200210 + +fmop4s za1.s, {z10.h-z11.h}, z20.h // 10000001-00100100-00000011-01010001 +// CHECK-INST: fmop4s za1.s, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x51,0x03,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81240351 + +fmop4s za3.s, {z14.h-z15.h}, z30.h // 10000001-00101110-00000011-11010011 +// CHECK-INST: fmop4s za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xd3,0x03,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e03d3 + +// Multiple vectors + +fmop4s za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00110000-00000010-00010000 +// CHECK-INST: fmop4s za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x10,0x02,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81300210 + +fmop4s za1.s, {z10.h-z11.h}, {z20.h-z21.h} // 10000001-00110100-00000011-01010001 +// CHECK-INST: 
fmop4s za1.s, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x51,0x03,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81340351 + +fmop4s za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00111110-00000011-11010011 +// CHECK-INST: fmop4s za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xd3,0x03,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e03d3 diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening-diagnostics.s new file mode 100644 index 0000000000000..c9c59128f4206 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening-diagnostics.s @@ -0,0 +1,245 @@ +// RUN: not llvm-mc -triple=aarch64 -mattr=+sme2p2 < %s 2>&1 | FileCheck %s + +// FMOP4A + +// Single vectors + +fmop4a za0.d, z0.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, z0.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.d, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z15.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z16.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z0.s, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4a za0.s, z12.s, z17.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4a za0.s, z12.s, z14.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4a za0.s, z12.s, z31.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: 
error: Invalid restricted vector register, expected even register in z16.s..z30.s + +// Single and multiple vectors + +fmop4a za0.d, z0.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, z0.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.d, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z1.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z16.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za0.s, z0.s, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.s, {z17.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, z0.s, {z16.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, z0.s, {z12.s-z13.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4a za0.d, {z0.s-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, {z0.s-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.d-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +fmop4a za0.s, {z1.s-z2.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the 
range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z2.s-z4.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z16.s-z17.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.s-z1.s}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4a za0.s, {z0.s-z1.s}, z17.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4a za0.s, {z0.s-z1.s}, z12.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +// Multiple vectors + +fmop4a za0.d, {z0.s-z1.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4a za4.s, {z0.s-z1.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.d-z1.d}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z1.s-z2.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z2.s-z4.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z18.s-z19.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.s-z1.s}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+fmop4a za0.s, {z0.s-z1.s}, {z19.s-z20.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.s, {z0.s-z1.s}, {z16.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.s, {z0.s-z1.s}, {z10.s-z11.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + + +// FMOP4S + +// Single vectors + +fmop4s za0.d, z0.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, z0.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.d, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z15.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z16.s, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z0.s, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4s za0.s, z12.s, z17.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4s za0.s, z12.s, z14.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4s za0.s, z12.s, z31.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +// Single and multiple vectors + +fmop4s za0.d, z0.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix 
operand + +fmop4s za4.s, z0.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.d, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z1.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z16.s, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za0.s, z0.s, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.s, {z17.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, z0.s, {z16.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, z0.s, {z12.s-z13.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4s za0.d, {z0.s-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, {z0.s-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.d-z1.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix + +fmop4s za0.s, {z1.s-z2.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z2.s-z4.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s 
za0.s, {z16.s-z17.s}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.s-z1.s}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4s za0.s, {z0.s-z1.s}, z17.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +fmop4s za0.s, {z0.s-z1.s}, z12.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.s..z30.s + +// Multiple vectors + +fmop4s za0.d, {z0.s-z1.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand + +fmop4s za4.s, {z0.s-z1.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.d-z1.d}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z1.s-z2.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z2.s-z4.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z18.s-z19.s}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.s, {z0.s-z1.s}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.s-z1.s}, {z19.s-z20.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a 
multiple of 2 and with matching element types + +fmop4s za0.s, {z0.s-z1.s}, {z16.s-z18.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.s, {z0.s-z1.s}, {z10.s-z11.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening.s new file mode 100644 index 0000000000000..e65def17cd1b3 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp32-non-widening.s @@ -0,0 +1,179 @@ + +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + + +// FMOP4A + +// Single vectors + +fmop4a za0.s, z0.s, z16.s // 10000000-00000000-00000000-00000000 +// CHECK-INST: fmop4a za0.s, z0.s, z16.s +// CHECK-ENCODING: [0x00,0x00,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80000000 + +fmop4a za3.s, z12.s, z24.s // 10000000-00001000-00000001-10000011 +// CHECK-INST: fmop4a za3.s, z12.s, z24.s +// CHECK-ENCODING: [0x83,0x01,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80080183 + +fmop4a za3.s, z14.s, z30.s // 10000000-00001110-00000001-11000011 +// CHECK-INST: fmop4a za3.s, z14.s, z30.s +// CHECK-ENCODING: [0xc3,0x01,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e01c3 + +// Single and multiple vectors + +fmop4a za0.s, z0.s, {z16.s-z17.s} // 10000000-00010000-00000000-00000000 +// CHECK-INST: fmop4a za0.s, z0.s, { z16.s, z17.s } +// CHECK-ENCODING: [0x00,0x00,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80100000 + +fmop4a za1.s, z10.s, {z20.s-z21.s} // 10000000-00010100-00000001-01000001 +// CHECK-INST: fmop4a za1.s, z10.s, { z20.s, z21.s } +// CHECK-ENCODING: [0x41,0x01,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80140141 + +fmop4a za3.s, z14.s, {z30.s-z31.s} // 10000000-00011110-00000001-11000011 +// CHECK-INST: fmop4a za3.s, z14.s, { z30.s, z31.s } +// CHECK-ENCODING: [0xc3,0x01,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e01c3 + +// Multiple and single vectors + +fmop4a za0.s, {z0.s-z1.s}, z16.s // 10000000-00000000-00000010-00000000 +// CHECK-INST: fmop4a za0.s, { z0.s, z1.s }, z16.s +// CHECK-ENCODING: [0x00,0x02,0x00,0x80] +// CHECK-ERROR: instruction 
requires: sme2p2 +// CHECK-UNKNOWN: 80000200 + +fmop4a za1.s, {z10.s-z11.s}, z20.s // 10000000-00000100-00000011-01000001 +// CHECK-INST: fmop4a za1.s, { z10.s, z11.s }, z20.s +// CHECK-ENCODING: [0x41,0x03,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80040341 + +fmop4a za3.s, {z14.s-z15.s}, z30.s // 10000000-00001110-00000011-11000011 +// CHECK-INST: fmop4a za3.s, { z14.s, z15.s }, z30.s +// CHECK-ENCODING: [0xc3,0x03,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e03c3 + +// Multiple vectors + +fmop4a za0.s, {z0.s-z1.s}, {z16.s-z17.s} // 10000000-00010000-00000010-00000000 +// CHECK-INST: fmop4a za0.s, { z0.s, z1.s }, { z16.s, z17.s } +// CHECK-ENCODING: [0x00,0x02,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80100200 + +fmop4a za1.s, {z10.s-z11.s}, {z20.s-z21.s} // 10000000-00010100-00000011-01000001 +// CHECK-INST: fmop4a za1.s, { z10.s, z11.s }, { z20.s, z21.s } +// CHECK-ENCODING: [0x41,0x03,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80140341 + +fmop4a za3.s, {z14.s-z15.s}, {z30.s-z31.s} // 10000000-00011110-00000011-11000011 +// CHECK-INST: fmop4a za3.s, { z14.s, z15.s }, { z30.s, z31.s } +// CHECK-ENCODING: [0xc3,0x03,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e03c3 + +// FMOP4S + +// Single vectors + +fmop4s za0.s, z0.s, z16.s // 10000000-00000000-00000000-00010000 +// CHECK-INST: fmop4s za0.s, z0.s, z16.s +// CHECK-ENCODING: [0x10,0x00,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80000010 + +fmop4s za3.s, z12.s, z24.s // 10000000-00001000-00000001-10010011 +// CHECK-INST: fmop4s za3.s, z12.s, z24.s +// CHECK-ENCODING: [0x93,0x01,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80080193 + +fmop4s za3.s, z14.s, z30.s // 10000000-00001110-00000001-11010011 +// CHECK-INST: fmop4s za3.s, z14.s, z30.s +// CHECK-ENCODING: [0xd3,0x01,0x0e,0x80] 
+// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e01d3 + +// Single and multiple vectors + +fmop4s za0.s, z0.s, {z16.s-z17.s} // 10000000-00010000-00000000-00010000 +// CHECK-INST: fmop4s za0.s, z0.s, { z16.s, z17.s } +// CHECK-ENCODING: [0x10,0x00,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80100010 + +fmop4s za1.s, z10.s, {z20.s-z21.s} // 10000000-00010100-00000001-01010001 +// CHECK-INST: fmop4s za1.s, z10.s, { z20.s, z21.s } +// CHECK-ENCODING: [0x51,0x01,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80140151 + +fmop4s za3.s, z14.s, {z30.s-z31.s} // 10000000-00011110-00000001-11010011 +// CHECK-INST: fmop4s za3.s, z14.s, { z30.s, z31.s } +// CHECK-ENCODING: [0xd3,0x01,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e01d3 + +// Multiple and single vectors + +fmop4s za0.s, {z0.s-z1.s}, z16.s // 10000000-00000000-00000010-00010000 +// CHECK-INST: fmop4s za0.s, { z0.s, z1.s }, z16.s +// CHECK-ENCODING: [0x10,0x02,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80000210 + +fmop4s za1.s, {z10.s-z11.s}, z20.s // 10000000-00000100-00000011-01010001 +// CHECK-INST: fmop4s za1.s, { z10.s, z11.s }, z20.s +// CHECK-ENCODING: [0x51,0x03,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80040351 + +fmop4s za3.s, {z14.s-z15.s}, z30.s // 10000000-00001110-00000011-11010011 +// CHECK-INST: fmop4s za3.s, { z14.s, z15.s }, z30.s +// CHECK-ENCODING: [0xd3,0x03,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e03d3 + +// Multiple vectors + +fmop4s za0.s, {z0.s-z1.s}, {z16.s-z17.s} // 10000000-00010000-00000010-00010000 +// CHECK-INST: fmop4s za0.s, { z0.s, z1.s }, { z16.s, z17.s } +// CHECK-ENCODING: [0x10,0x02,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80100210 + +fmop4s za1.s, {z10.s-z11.s}, {z20.s-z21.s} // 10000000-00010100-00000011-01010001 +// 
CHECK-INST: fmop4s za1.s, { z10.s, z11.s }, { z20.s, z21.s } +// CHECK-ENCODING: [0x51,0x03,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80140351 + +fmop4s za3.s, {z14.s-z15.s}, {z30.s-z31.s} // 10000000-00011110-00000011-11010011 +// CHECK-INST: fmop4s za3.s, { z14.s, z15.s }, { z30.s, z31.s } +// CHECK-ENCODING: [0xd3,0x03,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e03d3 diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening-diagnostics.s new file mode 100644 index 0000000000000..ff9602bc12afc --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening-diagnostics.s @@ -0,0 +1,243 @@ +// RUN: not llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-f64f64 < %s 2>&1 | FileCheck %s + +// FMOP4A + +// Single vectors + +fmop4a za0.s, z0.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za8.d, z0.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, z0.s, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z15.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z16.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z0.d, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4a za0.d, z12.d, z17.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4a za0.d, z12.d, z14.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in 
z16.d..z30.d + +fmop4a za0.d, z12.d, z31.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +// Single and multiple vectors + +fmop4a za0.s, z0.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4a za8.d, z0.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, z0.s, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z1.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z16.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4a za0.d, z0.d, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, z0.d, {z17.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.d, z0.d, {z16.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, z0.d, {z12.d-z13.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4a za0.s, {z0.d-z1.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za8.d, {z0.d-z1.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z0.s-z1.s}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a 
za0.d, {z1.d-z2.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.d, {z0.d-z2.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z16.d-z17.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.d, {z0.d-z1.d}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4a za0.d, {z0.d-z1.d}, z17.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4a za0.d, {z0.d-z1.d}, z12.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +// Multiple vectors + +fmop4a za0.s, {z0.d-z1.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za8.d, {z0.d-z1.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z0.s-z1.s}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z1.d-z2.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.d, {z0.d-z2.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z18.d-z19.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 
and with matching element types + +fmop4a za0.d, {z0.d-z1.d}, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z0.d-z1.d}, {z19.d-z20.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4a za0.d, {z0.d-z1.d}, {z16.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4a za0.d, {z0.d-z1.d}, {z10.d-z11.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// FMOP4S + +// Single vectors + +fmop4s za0.s, z0.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za8.d, z0.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, z0.s, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z15.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z16.d, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z0.d, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4s za0.d, z12.d, z17.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4s za0.d, z12.d, z14.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4s za0.d, z12.d, z31.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: 
Invalid restricted vector register, expected even register in z16.d..z30.d + +// Single and multiple vectors + +fmop4s za0.s, z0.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.s..z14.s + +fmop4s za8.d, z0.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, z0.s, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z1.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z16.d, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.d..z14.d + +fmop4s za0.d, z0.d, {z16.s-z17.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, z0.d, {z17.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, z0.d, {z16.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, z0.d, {z12.d-z13.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +// Multiple and single vectors + +fmop4s za0.s, {z0.d-z1.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za8.d, {z0.d-z1.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z0.s-z1.s}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z1.d-z2.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, 
expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, {z0.d-z2.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z16.d-z17.d}, z16.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, {z0.d-z1.d}, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4s za0.d, {z0.d-z1.d}, z17.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +fmop4s za0.d, {z0.d-z1.d}, z12.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.d..z30.d + +// Multiple vectors + +fmop4s za0.s, {z0.d-z1.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za8.d, {z0.d-z1.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z0.s-z1.s}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z1.d-z2.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, {z0.d-z2.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z18.d-z19.d}, {z16.d-z17.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, {z0.d-z1.d}, {z16.s-z17.s} +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z0.d-z1.d}, {z19.d-z20.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types + +fmop4s za0.d, {z0.d-z1.d}, {z16.d-z18.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +fmop4s za0.d, {z0.d-z1.d}, {z10.d-z11.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types diff --git a/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening.s b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening.s new file mode 100644 index 0000000000000..b0ad2984ad5ac --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/fmop4as-fp64-non-widening.s @@ -0,0 +1,180 @@ + +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-f64f64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-f64f64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-f64f64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-f64f64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-f64f64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-f64f64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + + +// FMOP4A + +// Single vectors + +fmop4a za0.d, z0.d, z16.d // 10000000-11000000-00000000-00001000 +// CHECK-INST: fmop4a za0.d, z0.d, z16.d +// CHECK-ENCODING: [0x08,0x00,0xc0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c00008 + +fmop4a za5.d, z10.d, z20.d // 10000000-11000100-00000001-01001101 +// CHECK-INST: fmop4a za5.d, z10.d, z20.d +// CHECK-ENCODING: [0x4d,0x01,0xc4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c4014d + +fmop4a za7.d, z14.d, z30.d // 10000000-11001110-00000001-11001111 +// CHECK-INST: fmop4a za7.d, z14.d, z30.d +// CHECK-ENCODING: [0xcf,0x01,0xce,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80ce01cf + +// Single and multiple vectors + +fmop4a za0.d, z0.d, {z16.d-z17.d} // 10000000-11010000-00000000-00001000 +// CHECK-INST: fmop4a za0.d, z0.d, { z16.d, z17.d } +// CHECK-ENCODING: [0x08,0x00,0xd0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d00008 + +fmop4a za5.d, z10.d, {z20.d-z21.d} // 10000000-11010100-00000001-01001101 +// CHECK-INST: fmop4a za5.d, z10.d, { z20.d, z21.d } +// CHECK-ENCODING: [0x4d,0x01,0xd4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d4014d + +fmop4a za7.d, z14.d, {z30.d-z31.d} // 10000000-11011110-00000001-11001111 +// CHECK-INST: fmop4a za7.d, z14.d, { z30.d, z31.d } +// CHECK-ENCODING: [0xcf,0x01,0xde,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80de01cf + +// Multiple and single vectors + +fmop4a za0.d, {z0.d-z1.d}, z16.d // 10000000-11000000-00000010-00001000 +// CHECK-INST: fmop4a za0.d, { z0.d, 
z1.d }, z16.d +// CHECK-ENCODING: [0x08,0x02,0xc0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c00208 + +fmop4a za5.d, {z10.d-z11.d}, z20.d // 10000000-11000100-00000011-01001101 +// CHECK-INST: fmop4a za5.d, { z10.d, z11.d }, z20.d +// CHECK-ENCODING: [0x4d,0x03,0xc4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c4034d + +fmop4a za7.d, {z14.d-z15.d}, z30.d // 10000000-11001110-00000011-11001111 +// CHECK-INST: fmop4a za7.d, { z14.d, z15.d }, z30.d +// CHECK-ENCODING: [0xcf,0x03,0xce,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80ce03cf + +// Multiple vectors + +fmop4a za0.d, {z0.d-z1.d}, {z16.d-z17.d} // 10000000-11010000-00000010-00001000 +// CHECK-INST: fmop4a za0.d, { z0.d, z1.d }, { z16.d, z17.d } +// CHECK-ENCODING: [0x08,0x02,0xd0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d00208 + +fmop4a za5.d, {z10.d-z11.d}, {z20.d-z21.d} // 10000000-11010100-00000011-01001101 +// CHECK-INST: fmop4a za5.d, { z10.d, z11.d }, { z20.d, z21.d } +// CHECK-ENCODING: [0x4d,0x03,0xd4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d4034d + +fmop4a za7.d, {z14.d-z15.d}, {z30.d-z31.d} // 10000000-11011110-00000011-11001111 +// CHECK-INST: fmop4a za7.d, { z14.d, z15.d }, { z30.d, z31.d } +// CHECK-ENCODING: [0xcf,0x03,0xde,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80de03cf + + +// FMOP4S + +// Single vectors + +fmop4s za0.d, z0.d, z16.d // 10000000-11000000-00000000-00011000 +// CHECK-INST: fmop4s za0.d, z0.d, z16.d +// CHECK-ENCODING: [0x18,0x00,0xc0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c00018 + +fmop4s za5.d, z10.d, z20.d // 10000000-11000100-00000001-01011101 +// CHECK-INST: fmop4s za5.d, z10.d, z20.d +// CHECK-ENCODING: [0x5d,0x01,0xc4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// 
CHECK-UNKNOWN: 80c4015d + +fmop4s za7.d, z14.d, z30.d // 10000000-11001110-00000001-11011111 +// CHECK-INST: fmop4s za7.d, z14.d, z30.d +// CHECK-ENCODING: [0xdf,0x01,0xce,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80ce01df + +// Single and multiple vectors + +fmop4s za0.d, z0.d, {z16.d-z17.d} // 10000000-11010000-00000000-00011000 +// CHECK-INST: fmop4s za0.d, z0.d, { z16.d, z17.d } +// CHECK-ENCODING: [0x18,0x00,0xd0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d00018 + +fmop4s za5.d, z10.d, {z20.d-z21.d} // 10000000-11010100-00000001-01011101 +// CHECK-INST: fmop4s za5.d, z10.d, { z20.d, z21.d } +// CHECK-ENCODING: [0x5d,0x01,0xd4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d4015d + +fmop4s za7.d, z14.d, {z30.d-z31.d} // 10000000-11011110-00000001-11011111 +// CHECK-INST: fmop4s za7.d, z14.d, { z30.d, z31.d } +// CHECK-ENCODING: [0xdf,0x01,0xde,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80de01df + +// Multiple and single vectors + +fmop4s za0.d, {z0.d-z1.d}, z16.d // 10000000-11000000-00000010-00011000 +// CHECK-INST: fmop4s za0.d, { z0.d, z1.d }, z16.d +// CHECK-ENCODING: [0x18,0x02,0xc0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c00218 + +fmop4s za5.d, {z10.d-z11.d}, z20.d // 10000000-11000100-00000011-01011101 +// CHECK-INST: fmop4s za5.d, { z10.d, z11.d }, z20.d +// CHECK-ENCODING: [0x5d,0x03,0xc4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80c4035d + +fmop4s za7.d, {z14.d-z15.d}, z30.d // 10000000-11001110-00000011-11011111 +// CHECK-INST: fmop4s za7.d, { z14.d, z15.d }, z30.d +// CHECK-ENCODING: [0xdf,0x03,0xce,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80ce03df + +// Multiple vectors + +fmop4s za0.d, {z0.d-z1.d}, {z16.d-z17.d} // 10000000-11010000-00000010-00011000 +// CHECK-INST: 
fmop4s za0.d, { z0.d, z1.d }, { z16.d, z17.d } +// CHECK-ENCODING: [0x18,0x02,0xd0,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d00218 + +fmop4s za5.d, {z10.d-z11.d}, {z20.d-z21.d} // 10000000-11010100-00000011-01011101 +// CHECK-INST: fmop4s za5.d, { z10.d, z11.d }, { z20.d, z21.d } +// CHECK-ENCODING: [0x5d,0x03,0xd4,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80d4035d + +fmop4s za7.d, {z14.d-z15.d}, {z30.d-z31.d} // 10000000-11011110-00000011-11011111 +// CHECK-INST: fmop4s za7.d, { z14.d, z15.d }, { z30.d, z31.d } +// CHECK-ENCODING: [0xdf,0x03,0xde,0x80] +// CHECK-ERROR: instruction requires: sme2p2 sme-f64f64 +// CHECK-UNKNOWN: 80de03df diff --git a/llvm/test/MC/AArch64/SME2p2/smop4a-16to32.s b/llvm/test/MC/AArch64/SME2p2/smop4a-16to32.s new file mode 100644 index 0000000000000..fe4de7307ec50 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4a-16to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4a za0.s, z0.h, z16.h // 10000000-00000000-10000000-00001000 +// CHECK-INST: smop4a za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x80,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008008 + +smop4a za3.s, z12.h, z24.h // 10000000-00001000-10000001-10001011 +// CHECK-INST: smop4a za3.s, z12.h, z24.h +// CHECK-ENCODING: [0x8b,0x81,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8008818b + +smop4a za3.s, z14.h, z30.h // 10000000-00001110-10000001-11001011 +// CHECK-INST: smop4a za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xcb,0x81,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e81cb + +smop4a za0.s, z0.h, {z16.h-z17.h} // 10000000-00010000-10000000-00001000 +// CHECK-INST: smop4a za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x80,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108008 + +smop4a za3.s, z12.h, {z24.h-z25.h} // 10000000-00011000-10000001-10001011 +// CHECK-INST: smop4a za3.s, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: [0x8b,0x81,0x18,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8018818b + +smop4a za3.s, z14.h, {z30.h-z31.h} // 10000000-00011110-10000001-11001011 +// CHECK-INST: smop4a za3.s, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xcb,0x81,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e81cb + +smop4a za0.s, {z0.h-z1.h}, z16.h // 10000000-00000000-10000010-00001000 +// CHECK-INST: smop4a za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x82,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008208 + +smop4a za3.s, {z12.h-z13.h}, z24.h // 
10000000-00001000-10000011-10001011 +// CHECK-INST: smop4a za3.s, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x8b,0x83,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8008838b + +smop4a za3.s, {z14.h-z15.h}, z30.h // 10000000-00001110-10000011-11001011 +// CHECK-INST: smop4a za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcb,0x83,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e83cb + +smop4a za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000000-00010000-10000010-00001000 +// CHECK-INST: smop4a za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x82,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108208 + +smop4a za3.s, {z12.h-z13.h}, {z24.h-z25.h} // 10000000-00011000-10000011-10001011 +// CHECK-INST: smop4a za3.s, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x8b,0x83,0x18,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8018838b + +smop4a za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000000-00011110-10000011-11001011 +// CHECK-INST: smop4a za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcb,0x83,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e83cb diff --git a/llvm/test/MC/AArch64/SME2p2/smop4a-64.s b/llvm/test/MC/AArch64/SME2p2/smop4a-64.s new file mode 100644 index 0000000000000..be9b6aaa47ecc --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4a-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ 
+// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4a za0.d, z0.h, z16.h // 10100000-11000000-00000000-00001000 +// CHECK-INST: smop4a za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x00,0xc0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c00008 + +smop4a za5.d, z10.h, z20.h // 10100000-11000100-00000001-01001101 +// CHECK-INST: smop4a za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x4d,0x01,0xc4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c4014d + +smop4a za7.d, z14.h, z30.h // 10100000-11001110-00000001-11001111 +// CHECK-INST: smop4a za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xcf,0x01,0xce,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ce01cf + +smop4a za0.d, z0.h, {z16.h-z17.h} // 10100000-11010000-00000000-00001000 +// CHECK-INST: smop4a za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x00,0xd0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d00008 + +smop4a za5.d, z10.h, {z20.h-z21.h} // 10100000-11010100-00000001-01001101 +// CHECK-INST: smop4a za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x01,0xd4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d4014d + +smop4a za7.d, z14.h, {z30.h-z31.h} // 10100000-11011110-00000001-11001111 +// CHECK-INST: smop4a za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x01,0xde,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0de01cf + +smop4a za0.d, {z0.h-z1.h}, z16.h // 10100000-11000000-00000010-00001000 +// CHECK-INST: smop4a za0.d, { z0.h, z1.h 
}, z16.h +// CHECK-ENCODING: [0x08,0x02,0xc0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c00208 + +smop4a za5.d, {z10.h-z11.h}, z20.h // 10100000-11000100-00000011-01001101 +// CHECK-INST: smop4a za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x4d,0x03,0xc4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c4034d + +smop4a za7.d, {z14.h-z15.h}, z30.h // 10100000-11001110-00000011-11001111 +// CHECK-INST: smop4a za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcf,0x03,0xce,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ce03cf + +smop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100000-11010000-00000010-00001000 +// CHECK-INST: smop4a za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x02,0xd0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d00208 + +smop4a za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100000-11010100-00000011-01001101 +// CHECK-INST: smop4a za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x03,0xd4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d4034d + +smop4a za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100000-11011110-00000011-11001111 +// CHECK-INST: smop4a za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x03,0xde,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0de03cf diff --git a/llvm/test/MC/AArch64/SME2p2/smop4a-8to32.s b/llvm/test/MC/AArch64/SME2p2/smop4a-8to32.s new file mode 100644 index 0000000000000..0615c8fc690d8 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4a-8to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d 
--mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4a za0.s, z0.b, z16.b // 10000000-00000000-10000000-00000000 +// CHECK-INST: smop4a za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x00,0x80,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008000 + +smop4a za1.s, z10.b, z20.b // 10000000-00000100-10000001-01000001 +// CHECK-INST: smop4a za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x41,0x81,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80048141 + +smop4a za3.s, z14.b, z30.b // 10000000-00001110-10000001-11000011 +// CHECK-INST: smop4a za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xc3,0x81,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e81c3 + +smop4a za0.s, z0.b, {z16.b-z17.b} // 10000000-00010000-10000000-00000000 +// CHECK-INST: smop4a za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x80,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108000 + +smop4a za3.s, z12.b, {z24.b-z25.b} // 10000000-00011000-10000001-10000011 +// CHECK-INST: smop4a za3.s, z12.b, { z24.b, z25.b } +// CHECK-ENCODING: [0x83,0x81,0x18,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80188183 + +smop4a za3.s, z14.b, {z30.b-z31.b} // 10000000-00011110-10000001-11000011 +// CHECK-INST: smop4a za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x81,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e81c3 + +smop4a 
za0.s, {z0.b-z1.b}, z16.b // 10000000-00000000-10000010-00000000 +// CHECK-INST: smop4a za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x00,0x82,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008200 + +smop4a za1.s, {z10.b-z11.b}, z20.b // 10000000-00000100-10000011-01000001 +// CHECK-INST: smop4a za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x41,0x83,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80048341 + +smop4a za3.s, {z14.b-z15.b}, z30.b // 10000000-00001110-10000011-11000011 +// CHECK-INST: smop4a za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xc3,0x83,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e83c3 + +smop4a za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000000-00010000-10000010-00000000 +// CHECK-INST: smop4a za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x82,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108200 + +smop4a za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000000-00010100-10000011-01000001 +// CHECK-INST: smop4a za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x83,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80148341 + +smop4a za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000000-00011110-10000011-11000011 +// CHECK-INST: smop4a za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x83,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e83c3 diff --git a/llvm/test/MC/AArch64/SME2p2/smop4a-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/smop4a-diagnostics.s new file mode 100644 index 0000000000000..01ddbe135c948 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4a-diagnostics.s @@ -0,0 +1,82 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s 
=> za0-za3, .d => za0-za7 + +smop4a za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4a za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4a za4.s, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4a za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +smop4a za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: smop4a za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: smop4a za0.s, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za0.s, z16.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: smop4a za0.s, z16.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +smop4a za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4a za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za0.s, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors 
in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: smop4a za0.s, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za0.s, z14.b, z14.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: smop4a za0.s, z14.b, z14.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .s => .h, .d => .h + +smop4a za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4a za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: smop4a za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4a za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4a za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: smop4a za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/smop4s-16to32.s b/llvm/test/MC/AArch64/SME2p2/smop4s-16to32.s new file mode 100644 index 0000000000000..41828c97321a8 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4s-16to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc 
-triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4s za0.s, z0.h, z16.h // 10000000-00000000-10000000-00011000 +// CHECK-INST: smop4s za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x80,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008018 + +smop4s za3.s, z12.h, z24.h // 10000000-00001000-10000001-10011011 +// CHECK-INST: smop4s za3.s, z12.h, z24.h +// CHECK-ENCODING: [0x9b,0x81,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8008819b + +smop4s za3.s, z14.h, z30.h // 10000000-00001110-10000001-11011011 +// CHECK-INST: smop4s za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xdb,0x81,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e81db + +smop4s za0.s, z0.h, {z16.h-z17.h} // 10000000-00010000-10000000-00011000 +// CHECK-INST: smop4s za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x80,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108018 + +smop4s za3.s, z12.h, {z24.h-z25.h} // 10000000-00011000-10000001-10011011 +// CHECK-INST: smop4s za3.s, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: [0x9b,0x81,0x18,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8018819b + +smop4s za3.s, z14.h, {z30.h-z31.h} // 10000000-00011110-10000001-11011011 +// CHECK-INST: smop4s za3.s, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdb,0x81,0x1e,0x80] +// 
CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e81db + +smop4s za0.s, {z0.h-z1.h}, z16.h // 10000000-00000000-10000010-00011000 +// CHECK-INST: smop4s za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x82,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008218 + +smop4s za3.s, {z12.h-z13.h}, z24.h // 10000000-00001000-10000011-10011011 +// CHECK-INST: smop4s za3.s, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x9b,0x83,0x08,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8008839b + +smop4s za3.s, {z14.h-z15.h}, z30.h // 10000000-00001110-10000011-11011011 +// CHECK-INST: smop4s za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdb,0x83,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e83db + +smop4s za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000000-00010000-10000010-00011000 +// CHECK-INST: smop4s za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x82,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108218 + +smop4s za3.s, {z12.h-z13.h}, {z24.h-z25.h} // 10000000-00011000-10000011-10011011 +// CHECK-INST: smop4s za3.s, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x9b,0x83,0x18,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8018839b + +smop4s za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000000-00011110-10000011-11011011 +// CHECK-INST: smop4s za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xdb,0x83,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e83db diff --git a/llvm/test/MC/AArch64/SME2p2/smop4s-64.s b/llvm/test/MC/AArch64/SME2p2/smop4s-64.s new file mode 100644 index 0000000000000..1b2e2637f0119 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4s-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc 
-triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4s za0.d, z0.h, z16.h // 10100000-11000000-00000000-00011000 +// CHECK-INST: smop4s za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x00,0xc0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c00018 + +smop4s za5.d, z10.h, z20.h // 10100000-11000100-00000001-01011101 +// CHECK-INST: smop4s za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x5d,0x01,0xc4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c4015d + +smop4s za7.d, z14.h, z30.h // 10100000-11001110-00000001-11011111 +// CHECK-INST: smop4s za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xdf,0x01,0xce,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ce01df + +smop4s za0.d, z0.h, {z16.h-z17.h} // 10100000-11010000-00000000-00011000 +// CHECK-INST: smop4s za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x00,0xd0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d00018 + +smop4s za5.d, z10.h, {z20.h-z21.h} // 10100000-11010100-00000001-01011101 +// CHECK-INST: smop4s za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x01,0xd4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d4015d + +smop4s 
za7.d, z14.h, {z30.h-z31.h} // 10100000-11011110-00000001-11011111 +// CHECK-INST: smop4s za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x01,0xde,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0de01df + +smop4s za0.d, {z0.h-z1.h}, z16.h // 10100000-11000000-00000010-00011000 +// CHECK-INST: smop4s za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x02,0xc0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c00218 + +smop4s za5.d, {z10.h-z11.h}, z20.h // 10100000-11000100-00000011-01011101 +// CHECK-INST: smop4s za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x5d,0x03,0xc4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0c4035d + +smop4s za7.d, {z14.h-z15.h}, z30.h // 10100000-11001110-00000011-11011111 +// CHECK-INST: smop4s za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdf,0x03,0xce,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ce03df + +smop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100000-11010000-00000010-00011000 +// CHECK-INST: smop4s za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x02,0xd0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d00218 + +smop4s za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100000-11010100-00000011-01011101 +// CHECK-INST: smop4s za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x03,0xd4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0d4035d + +smop4s za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100000-11011110-00000011-11011111 +// CHECK-INST: smop4s za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x03,0xde,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0de03df diff --git a/llvm/test/MC/AArch64/SME2p2/smop4s-8to32.s b/llvm/test/MC/AArch64/SME2p2/smop4s-8to32.s new file mode 100644 index 0000000000000..43fbda9172866 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4s-8to32.s @@ -0,0 
+1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +smop4s za0.s, z0.b, z16.b // 10000000-00000000-10000000-00010000 +// CHECK-INST: smop4s za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x10,0x80,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008010 + +smop4s za1.s, z10.b, z20.b // 10000000-00000100-10000001-01010001 +// CHECK-INST: smop4s za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x51,0x81,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80048151 + +smop4s za3.s, z14.b, z30.b // 10000000-00001110-10000001-11010011 +// CHECK-INST: smop4s za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xd3,0x81,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e81d3 + +smop4s za0.s, z0.b, {z16.b-z17.b} // 10000000-00010000-10000000-00010000 +// CHECK-INST: smop4s za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x80,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108010 + +smop4s za1.s, z10.b, {z20.b-z21.b} // 10000000-00010100-10000001-01010001 +// CHECK-INST: smop4s za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: 
[0x51,0x81,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80148151 + +smop4s za3.s, z14.b, {z30.b-z31.b} // 10000000-00011110-10000001-11010011 +// CHECK-INST: smop4s za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x81,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e81d3 + +smop4s za0.s, {z0.b-z1.b}, z16.b // 10000000-00000000-10000010-00010000 +// CHECK-INST: smop4s za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x10,0x82,0x00,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80008210 + +smop4s za1.s, {z10.b-z11.b}, z20.b // 10000000-00000100-10000011-01010001 +// CHECK-INST: smop4s za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x51,0x83,0x04,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80048351 + +smop4s za3.s, {z14.b-z15.b}, z30.b // 10000000-00001110-10000011-11010011 +// CHECK-INST: smop4s za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xd3,0x83,0x0e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 800e83d3 + +smop4s za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000000-00010000-10000010-00010000 +// CHECK-INST: smop4s za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x82,0x10,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80108210 + +smop4s za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000000-00010100-10000011-01010001 +// CHECK-INST: smop4s za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x83,0x14,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80148351 + +smop4s za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000000-00011110-10000011-11010011 +// CHECK-INST: smop4s za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x83,0x1e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 801e83d3 diff --git a/llvm/test/MC/AArch64/SME2p2/smop4s-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/smop4s-diagnostics.s new file 
mode 100644 index 0000000000000..a11402c8c7be6 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/smop4s-diagnostics.s @@ -0,0 +1,82 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d => za0-za7 + +smop4s za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4s za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4s za4.s, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: smop4s za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +smop4s za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: smop4s za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: smop4s za0.s, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za0.s, z16.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: smop4s za0.s, z16.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +smop4s za0.d, 
z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4s za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za0.s, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: smop4s za0.s, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za0.s, z14.b, z14.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: smop4s za0.s, z14.b, z14.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .d => .h + +smop4s za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4s za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: smop4s za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: smop4s za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +smop4s za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: smop4s za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/stmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/stmopa-diagnostics.s new file mode 100644 index 
0000000000000..db395376d7ceb --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/stmopa-diagnostics.s @@ -0,0 +1,100 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Invalid ZA register + +stmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za4.s, {z30.h-z31.h}, z31.h, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stmopa za4.s, {z30.h-z31.h}, z31.h, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid vector list operand + +stmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: stmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z29.h-z30.h}, z31.h, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: stmopa za3.s, {z29.h-z30.h}, z31.h, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZK register +stmopa za3.s, {z28.b-z29.b}, z31.b, z19[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.b-z29.b}, z31.b, z19[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.b-z29.b}, z31.b, z24[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted 
vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.b-z29.b}, z31.b, z24[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.b-z29.b}, z31.b, z27[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.b-z29.b}, z31.b, z27[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.h-z29.h}, z31.h, z19[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.h-z29.h}, z31.h, z19[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.h-z29.h}, z31.h, z24[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.h-z29.h}, z31.h, z24[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.h-z29.h}, z31.h, z27[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: stmopa za3.s, {z28.h-z29.h}, z31.h, z27[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid immediate + +stmopa za3.s, {z28.b-z29.b}, z31.b, z20[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +// CHECK-NEXT: stmopa za3.s, {z28.b-z29.b}, z31.b, z20[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.h-z29.h}, z31.h, z20[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. 
+// CHECK-NEXT: stmopa za3.s, {z28.h-z29.h}, z31.h, z20[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZPR type suffix + +stmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: stmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: stmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: stmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: stmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +stmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: stmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/stmopa.s b/llvm/test/MC/AArch64/SME2p2/stmopa.s new file mode 100644 index 0000000000000..c641eb46108a7 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/stmopa.s @@ -0,0 +1,49 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: 
llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +stmopa za0.s, {z0.b-z1.b}, z0.b, z20[0] // 10000000-01000000-10000000-00000000 +// CHECK-INST: stmopa za0.s, { z0.b, z1.b }, z0.b, z20[0] +// CHECK-ENCODING: [0x00,0x80,0x40,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80408000 + +stmopa za3.s, {z12.b-z13.b}, z8.b, z23[3] // 10000000-01001000-10001101-10110011 +// CHECK-INST: stmopa za3.s, { z12.b, z13.b }, z8.b, z23[3] +// CHECK-ENCODING: [0xb3,0x8d,0x48,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80488db3 + +stmopa za3.s, {z30.b-z31.b}, z31.b, z31[3] // 10000000-01011111-10011111-11110011 +// CHECK-INST: stmopa za3.s, { z30.b, z31.b }, z31.b, z31[3] +// CHECK-ENCODING: [0xf3,0x9f,0x5f,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 805f9ff3 + +stmopa za0.s, {z0.h-z1.h}, z0.h, z20[0] // 10000000-01000000-10000000-00001000 +// CHECK-INST: stmopa za0.s, { z0.h, z1.h }, z0.h, z20[0] +// CHECK-ENCODING: [0x08,0x80,0x40,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80408008 + +stmopa za3.s, {z12.h-z13.h}, z8.h, z23[3] // 10000000-01001000-10001101-10111011 +// CHECK-INST: stmopa za3.s, { z12.h, z13.h }, z8.h, z23[3] +// CHECK-ENCODING: [0xbb,0x8d,0x48,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80488dbb + +stmopa za3.s, {z30.h-z31.h}, z31.h, z31[3] // 10000000-01011111-10011111-11111011 +// CHECK-INST: stmopa za3.s, { z30.h, z31.h }, z31.h, z31[3] +// CHECK-ENCODING: 
[0xfb,0x9f,0x5f,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 805f9ffb diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4a-32.s b/llvm/test/MC/AArch64/SME2p2/sumop4a-32.s new file mode 100644 index 0000000000000..8633eae24f498 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4a-32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +sumop4a za0.s, z0.b, z16.b // 10000000-00100000-10000000-00000000 +// CHECK-INST: sumop4a za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x00,0x80,0x20,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80208000 + +sumop4a za1.s, z10.b, z20.b // 10000000-00100100-10000001-01000001 +// CHECK-INST: sumop4a za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x41,0x81,0x24,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80248141 + +sumop4a za3.s, z14.b, z30.b // 10000000-00101110-10000001-11000011 +// CHECK-INST: sumop4a za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xc3,0x81,0x2e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 802e81c3 + +sumop4a za0.s, z0.b, {z16.b-z17.b} // 
10000000-00110000-10000000-00000000 +// CHECK-INST: sumop4a za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x80,0x30,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80308000 + +sumop4a za1.s, z10.b, {z20.b-z21.b} // 10000000-00110100-10000001-01000001 +// CHECK-INST: sumop4a za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x81,0x34,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80348141 + +sumop4a za3.s, z14.b, {z30.b-z31.b} // 10000000-00111110-10000001-11000011 +// CHECK-INST: sumop4a za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x81,0x3e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 803e81c3 + +sumop4a za0.s, {z0.b-z1.b}, z16.b // 10000000-00100000-10000010-00000000 +// CHECK-INST: sumop4a za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x00,0x82,0x20,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80208200 + +sumop4a za1.s, {z10.b-z11.b}, z20.b // 10000000-00100100-10000011-01000001 +// CHECK-INST: sumop4a za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x41,0x83,0x24,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80248341 + +sumop4a za3.s, {z14.b-z15.b}, z30.b // 10000000-00101110-10000011-11000011 +// CHECK-INST: sumop4a za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xc3,0x83,0x2e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 802e83c3 + +sumop4a za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000000-00110000-10000010-00000000 +// CHECK-INST: sumop4a za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x82,0x30,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80308200 + +sumop4a za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000000-00110100-10000011-01000001 +// CHECK-INST: sumop4a za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x83,0x34,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80348341 + +sumop4a za3.s, 
{z14.b-z15.b}, {z30.b-z31.b} // 10000000-00111110-10000011-11000011 +// CHECK-INST: sumop4a za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x83,0x3e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 803e83c3 diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4a-64.s b/llvm/test/MC/AArch64/SME2p2/sumop4a-64.s new file mode 100644 index 0000000000000..53fa6f0892c5c --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4a-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +sumop4a za0.d, z0.h, z16.h // 10100000-11100000-00000000-00001000 +// CHECK-INST: sumop4a za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x00,0xe0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e00008 + +sumop4a za5.d, z10.h, z20.h // 10100000-11100100-00000001-01001101 +// CHECK-INST: sumop4a za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x4d,0x01,0xe4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e4014d + +sumop4a za7.d, z14.h, z30.h // 10100000-11101110-00000001-11001111 +// CHECK-INST: sumop4a za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xcf,0x01,0xee,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ee01cf + +sumop4a za0.d, z0.h, {z16.h-z17.h} // 10100000-11110000-00000000-00001000 +// CHECK-INST: sumop4a za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x00,0xf0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f00008 + +sumop4a za5.d, z10.h, {z20.h-z21.h} // 10100000-11110100-00000001-01001101 +// CHECK-INST: sumop4a za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x01,0xf4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f4014d + +sumop4a za7.d, z14.h, {z30.h-z31.h} // 10100000-11111110-00000001-11001111 +// CHECK-INST: sumop4a za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x01,0xfe,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0fe01cf + +sumop4a za0.d, {z0.h-z1.h}, z16.h // 10100000-11100000-00000010-00001000 +// CHECK-INST: sumop4a za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x02,0xe0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e00208 + +sumop4a za5.d, 
{z10.h-z11.h}, z20.h // 10100000-11100100-00000011-01001101 +// CHECK-INST: sumop4a za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x4d,0x03,0xe4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e4034d + +sumop4a za7.d, {z14.h-z15.h}, z30.h // 10100000-11101110-00000011-11001111 +// CHECK-INST: sumop4a za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcf,0x03,0xee,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ee03cf + +sumop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100000-11110000-00000010-00001000 +// CHECK-INST: sumop4a za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x02,0xf0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f00208 + +sumop4a za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100000-11110100-00000011-01001101 +// CHECK-INST: sumop4a za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x03,0xf4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f4034d + +sumop4a za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100000-11111110-00000011-11001111 +// CHECK-INST: sumop4a za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x03,0xfe,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0fe03cf diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4a-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/sumop4a-diagnostics.s new file mode 100644 index 0000000000000..456aa1ad71310 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4a-diagnostics.s @@ -0,0 +1,68 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d => za0-za7 + +sumop4a za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sumop4a za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za8.d, z0.h, 
z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sumop4a za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +sumop4a za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: sumop4a za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za0.d, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: sumop4a za0.d, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +sumop4a za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: sumop4a za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za0.d, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: sumop4a za0.d, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .d => .h + +sumop4a za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: sumop4a za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: 
Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: sumop4a za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: sumop4a za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4a za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: sumop4a za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4s-32.s b/llvm/test/MC/AArch64/SME2p2/sumop4s-32.s new file mode 100644 index 0000000000000..be9a25d85a015 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4s-32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +sumop4s za0.s, z0.b, z16.b // 10000000-00100000-10000000-00010000 +// CHECK-INST: sumop4s za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x10,0x80,0x20,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80208010 + +sumop4s za1.s, z10.b, z20.b // 10000000-00100100-10000001-01010001 +// CHECK-INST: sumop4s za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x51,0x81,0x24,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80248151 + +sumop4s za3.s, z14.b, z30.b // 10000000-00101110-10000001-11010011 +// CHECK-INST: sumop4s za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xd3,0x81,0x2e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 802e81d3 + +sumop4s za0.s, z0.b, {z16.b-z17.b} // 10000000-00110000-10000000-00010000 +// CHECK-INST: sumop4s za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x80,0x30,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80308010 + +sumop4s za1.s, z10.b, {z20.b-z21.b} // 10000000-00110100-10000001-01010001 +// CHECK-INST: sumop4s za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x81,0x34,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80348151 + +sumop4s za3.s, z14.b, {z30.b-z31.b} // 10000000-00111110-10000001-11010011 +// CHECK-INST: sumop4s za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x81,0x3e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 803e81d3 + +sumop4s za0.s, {z0.b-z1.b}, z16.b // 10000000-00100000-10000010-00010000 +// CHECK-INST: sumop4s za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x10,0x82,0x20,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80208210 + +sumop4s za1.s, {z10.b-z11.b}, z20.b // 
10000000-00100100-10000011-01010001 +// CHECK-INST: sumop4s za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x51,0x83,0x24,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80248351 + +sumop4s za3.s, {z14.b-z15.b}, z30.b // 10000000-00101110-10000011-11010011 +// CHECK-INST: sumop4s za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xd3,0x83,0x2e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 802e83d3 + +sumop4s za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000000-00110000-10000010-00010000 +// CHECK-INST: sumop4s za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x82,0x30,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80308210 + +sumop4s za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000000-00110100-10000011-01010001 +// CHECK-INST: sumop4s za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x83,0x34,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80348351 + +sumop4s za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000000-00111110-10000011-11010011 +// CHECK-INST: sumop4s za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x83,0x3e,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 803e83d3 diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4s-64.s b/llvm/test/MC/AArch64/SME2p2/sumop4s-64.s new file mode 100644 index 0000000000000..dbb2b8ed53ea2 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4s-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +sumop4s za0.d, z0.h, z16.h // 10100000-11100000-00000000-00011000 +// CHECK-INST: sumop4s za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x00,0xe0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e00018 + +sumop4s za5.d, z10.h, z20.h // 10100000-11100100-00000001-01011101 +// CHECK-INST: sumop4s za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x5d,0x01,0xe4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e4015d + +sumop4s za7.d, z14.h, z30.h // 10100000-11101110-00000001-11011111 +// CHECK-INST: sumop4s za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xdf,0x01,0xee,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ee01df + +sumop4s za0.d, z0.h, {z16.h-z17.h} // 10100000-11110000-00000000-00011000 +// CHECK-INST: sumop4s za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x00,0xf0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f00018 + +sumop4s za5.d, z10.h, {z20.h-z21.h} // 10100000-11110100-00000001-01011101 +// CHECK-INST: sumop4s za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x01,0xf4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f4015d + +sumop4s za7.d, z14.h, {z30.h-z31.h} // 10100000-11111110-00000001-11011111 +// CHECK-INST: sumop4s za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x01,0xfe,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0fe01df + +sumop4s za0.d, {z0.h-z1.h}, z16.h // 
10100000-11100000-00000010-00011000 +// CHECK-INST: sumop4s za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x02,0xe0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e00218 + +sumop4s za5.d, {z10.h-z11.h}, z20.h // 10100000-11100100-00000011-01011101 +// CHECK-INST: sumop4s za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x5d,0x03,0xe4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0e4035d + +sumop4s za7.d, {z14.h-z15.h}, z30.h // 10100000-11101110-00000011-11011111 +// CHECK-INST: sumop4s za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdf,0x03,0xee,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0ee03df + +sumop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100000-11110000-00000010-00011000 +// CHECK-INST: sumop4s za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x02,0xf0,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f00218 + +sumop4s za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100000-11110100-00000011-01011101 +// CHECK-INST: sumop4s za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x03,0xf4,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0f4035d + +sumop4s za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100000-11111110-00000011-11011111 +// CHECK-INST: sumop4s za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x03,0xfe,0xa0] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a0fe03df diff --git a/llvm/test/MC/AArch64/SME2p2/sumop4s-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/sumop4s-diagnostics.s new file mode 100644 index 0000000000000..39a397d7b5671 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sumop4s-diagnostics.s @@ -0,0 +1,68 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d 
=> za0-za7 + +sumop4s za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sumop4s za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sumop4s za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +sumop4s za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: sumop4s za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za0.d, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: sumop4s za0.d, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +sumop4s za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: sumop4s za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za0.d, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: sumop4s za0.d, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .d => .h + +sumop4s za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid 
restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: sumop4s za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: sumop4s za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: sumop4s za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sumop4s za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: sumop4s za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/sutmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/sutmopa-diagnostics.s new file mode 100644 index 0000000000000..087126c26fa64 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sutmopa-diagnostics.s @@ -0,0 +1,76 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Invalid ZA register + +sutmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sutmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid vector list operand + +sutmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: sutmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// 
--------------------------------------------------------------------------// +// Invalid ZK register + +sutmopa za3.s, {z28.b-z29.b}, z31.b, z19[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: sutmopa za3.s, {z28.b-z29.b}, z31.b, z19[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.s, {z28.b-z29.b}, z31.b, z24[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: sutmopa za3.s, {z28.b-z29.b}, z31.b, z24[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.s, {z28.b-z29.b}, z31.b, z27[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: sutmopa za3.s, {z28.b-z29.b}, z31.b, z27[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid immediate + +sutmopa za3.s, {z28.b-z29.b}, z31.b, z29[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. 
+// CHECK-NEXT: sutmopa za3.s, {z28.b-z29.b}, z31.b, z29[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZPR type suffix + +sutmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: sutmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: sutmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.s, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sutmopa za3.s, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: sutmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: sutmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +sutmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: sutmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/sutmopa.s b/llvm/test/MC/AArch64/SME2p2/sutmopa.s new file mode 100644 index 0000000000000..94a86a8a3f166 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/sutmopa.s @@ -0,0 +1,31 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ 
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +sutmopa za0.s, {z0.b-z1.b}, z0.b, z20[0] // 10000000-01100000-10000000-00000000 +// CHECK-INST: sutmopa za0.s, { z0.b, z1.b }, z0.b, z20[0] +// CHECK-ENCODING: [0x00,0x80,0x60,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80608000 + +sutmopa za1.s, {z10.b-z11.b}, z21.b, z29[1] // 10000000-01110101-10010101-01010001 +// CHECK-INST: sutmopa za1.s, { z10.b, z11.b }, z21.b, z29[1] +// CHECK-ENCODING: [0x51,0x95,0x75,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 80759551 + +sutmopa za3.s, {z30.b-z31.b}, z31.b, z31[3] // 10000000-01111111-10011111-11110011 +// CHECK-INST: sutmopa za3.s, { z30.b, z31.b }, z31.b, z31[3] +// CHECK-ENCODING: [0xf3,0x9f,0x7f,0x80] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 807f9ff3 diff --git a/llvm/test/MC/AArch64/SME2p2/umop4a-16to32.s b/llvm/test/MC/AArch64/SME2p2/umop4a-16to32.s new file mode 100644 index 0000000000000..15490565f8ecf --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4a-16to32.s @@ -0,0 +1,86 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | 
FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + + +umop4a za0.s, z0.h, z16.h // 10000001-00000000-10000000-00001000 +// CHECK-INST: umop4a za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x80,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008008 + +umop4a za3.s, z12.h, z24.h // 10000001-00001000-10000001-10001011 +// CHECK-INST: umop4a za3.s, z12.h, z24.h +// CHECK-ENCODING: [0x8b,0x81,0x08,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8108818b + +umop4a za3.s, z14.h, z30.h // 10000001-00001110-10000001-11001011 +// CHECK-INST: umop4a za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xcb,0x81,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e81cb + +umop4a za0.s, z0.h, {z16.h-z17.h} // 10000001-00010000-10000000-00001000 +// CHECK-INST: umop4a za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x80,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108008 + +umop4a za3.s, z12.h, {z24.h-z25.h} // 10000001-00011000-10000001-10001011 +// CHECK-INST: umop4a za3.s, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: [0x8b,0x81,0x18,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8118818b + +umop4a za3.s, z14.h, {z30.h-z31.h} // 10000001-00011110-10000001-11001011 +// CHECK-INST: umop4a za3.s, z14.h, { z30.h, 
z31.h } +// CHECK-ENCODING: [0xcb,0x81,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e81cb + +umop4a za0.s, {z0.h-z1.h}, z16.h // 10000001-00000000-10000010-00001000 +// CHECK-INST: umop4a za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x82,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008208 + +umop4a za3.s, {z12.h-z13.h}, z24.h // 10000001-00001000-10000011-10001011 +// CHECK-INST: umop4a za3.s, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x8b,0x83,0x08,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8108838b + +umop4a za3.s, {z14.h-z15.h}, z30.h // 10000001-00001110-10000011-11001011 +// CHECK-INST: umop4a za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcb,0x83,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e83cb + +umop4a za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00010000-10000010-00001000 +// CHECK-INST: umop4a za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x82,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108208 + +umop4a za3.s, {z12.h-z13.h}, {z24.h-z25.h} // 10000001-00011000-10000011-10001011 +// CHECK-INST: umop4a za3.s, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x8b,0x83,0x18,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8118838b + +umop4a za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00011110-10000011-11001011 +// CHECK-INST: umop4a za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcb,0x83,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e83cb diff --git a/llvm/test/MC/AArch64/SME2p2/umop4a-64.s b/llvm/test/MC/AArch64/SME2p2/umop4a-64.s new file mode 100644 index 0000000000000..c0f85af6abdbd --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4a-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s 
--check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +umop4a za0.d, z0.h, z16.h // 10100001-11100000-00000000-00001000 +// CHECK-INST: umop4a za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x00,0xe0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e00008 + +umop4a za5.d, z10.h, z20.h // 10100001-11100100-00000001-01001101 +// CHECK-INST: umop4a za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x4d,0x01,0xe4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e4014d + +umop4a za7.d, z14.h, z30.h // 10100001-11101110-00000001-11001111 +// CHECK-INST: umop4a za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xcf,0x01,0xee,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ee01cf + +umop4a za0.d, z0.h, {z16.h-z17.h} // 10100001-11110000-00000000-00001000 +// CHECK-INST: umop4a za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x00,0xf0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f00008 + +umop4a za5.d, z10.h, {z20.h-z21.h} // 10100001-11110100-00000001-01001101 +// CHECK-INST: umop4a za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x01,0xf4,0xa1] +// CHECK-ERROR: 
instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f4014d + +umop4a za7.d, z14.h, {z30.h-z31.h} // 10100001-11111110-00000001-11001111 +// CHECK-INST: umop4a za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x01,0xfe,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1fe01cf + +umop4a za0.d, {z0.h-z1.h}, z16.h // 10100001-11100000-00000010-00001000 +// CHECK-INST: umop4a za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x02,0xe0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e00208 + +umop4a za5.d, {z10.h-z11.h}, z20.h // 10100001-11100100-00000011-01001101 +// CHECK-INST: umop4a za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x4d,0x03,0xe4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e4034d + +umop4a za7.d, {z14.h-z15.h}, z30.h // 10100001-11101110-00000011-11001111 +// CHECK-INST: umop4a za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcf,0x03,0xee,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ee03cf + +umop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100001-11110000-00000010-00001000 +// CHECK-INST: umop4a za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x02,0xf0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f00208 + +umop4a za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100001-11110100-00000011-01001101 +// CHECK-INST: umop4a za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x03,0xf4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f4034d + +umop4a za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100001-11111110-00000011-11001111 +// CHECK-INST: umop4a za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x03,0xfe,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1fe03cf diff --git a/llvm/test/MC/AArch64/SME2p2/umop4a-8to32.s b/llvm/test/MC/AArch64/SME2p2/umop4a-8to32.s new file mode 100644 index 0000000000000..d22d2249709be --- 
/dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4a-8to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +umop4a za0.s, z0.b, z16.b // 10000001-00100000-10000000-00000000 +// CHECK-INST: umop4a za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x00,0x80,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81208000 + +umop4a za1.s, z10.b, z20.b // 10000001-00100100-10000001-01000001 +// CHECK-INST: umop4a za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x41,0x81,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81248141 + +umop4a za3.s, z14.b, z30.b // 10000001-00101110-10000001-11000011 +// CHECK-INST: umop4a za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xc3,0x81,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e81c3 + +umop4a za0.s, z0.b, {z16.b-z17.b} // 10000001-00110000-10000000-00000000 +// CHECK-INST: umop4a za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x80,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81308000 + +umop4a za1.s, z10.b, {z20.b-z21.b} // 10000001-00110100-10000001-01000001 +// 
CHECK-INST: umop4a za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x81,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81348141 + +umop4a za3.s, z14.b, {z30.b-z31.b} // 10000001-00111110-10000001-11000011 +// CHECK-INST: umop4a za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x81,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e81c3 + +umop4a za0.s, {z0.b-z1.b}, z16.b // 10000001-00100000-10000010-00000000 +// CHECK-INST: umop4a za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x00,0x82,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81208200 + +umop4a za1.s, {z10.b-z11.b}, z20.b // 10000001-00100100-10000011-01000001 +// CHECK-INST: umop4a za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x41,0x83,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81248341 + +umop4a za3.s, {z14.b-z15.b}, z30.b // 10000001-00101110-10000011-11000011 +// CHECK-INST: umop4a za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xc3,0x83,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e83c3 + +umop4a za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000001-00110000-10000010-00000000 +// CHECK-INST: umop4a za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x82,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81308200 + +umop4a za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000001-00110100-10000011-01000001 +// CHECK-INST: umop4a za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x83,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81348341 + +umop4a za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000001-00111110-10000011-11000011 +// CHECK-INST: umop4a za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x83,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e83c3 diff --git 
a/llvm/test/MC/AArch64/SME2p2/umop4a-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/umop4a-diagnostics.s new file mode 100644 index 0000000000000..a522ab1baacda --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4a-diagnostics.s @@ -0,0 +1,82 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d => za0-za7 + +umop4a za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4a za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4a za4.s, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4a za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +umop4a za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: umop4a za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: umop4a za0.s, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za0.s, z16.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: umop4a za0.s, z16.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// 
------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +umop4a za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4a za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za0.s, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: umop4a za0.s, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za0.s, z14.b, z14.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: umop4a za0.s, z14.b, z14.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .s => .h, .d => .h + +umop4a za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4a za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: umop4a za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4a za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4a za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: umop4a za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git 
a/llvm/test/MC/AArch64/SME2p2/umop4s-16to32.s b/llvm/test/MC/AArch64/SME2p2/umop4s-16to32.s new file mode 100644 index 0000000000000..c83925737005e --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4s-16to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +umop4s za0.s, z0.h, z16.h // 10000001-00000000-10000000-00011000 +// CHECK-INST: umop4s za0.s, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x80,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008018 + +umop4s za3.s, z12.h, z24.h // 10000001-00001000-10000001-10011011 +// CHECK-INST: umop4s za3.s, z12.h, z24.h +// CHECK-ENCODING: [0x9b,0x81,0x08,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8108819b + +umop4s za3.s, z14.h, z30.h // 10000001-00001110-10000001-11011011 +// CHECK-INST: umop4s za3.s, z14.h, z30.h +// CHECK-ENCODING: [0xdb,0x81,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e81db + +umop4s za0.s, z0.h, {z16.h-z17.h} // 10000001-00010000-10000000-00011000 +// CHECK-INST: umop4s za0.s, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x80,0x10,0x81] +// 
CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108018 + +umop4s za3.s, z12.h, {z24.h-z25.h} // 10000001-00011000-10000001-10011011 +// CHECK-INST: umop4s za3.s, z12.h, { z24.h, z25.h } +// CHECK-ENCODING: [0x9b,0x81,0x18,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8118819b + +umop4s za3.s, z14.h, {z30.h-z31.h} // 10000001-00011110-10000001-11011011 +// CHECK-INST: umop4s za3.s, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdb,0x81,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e81db + +umop4s za0.s, {z0.h-z1.h}, z16.h // 10000001-00000000-10000010-00011000 +// CHECK-INST: umop4s za0.s, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x82,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008218 + +umop4s za3.s, {z12.h-z13.h}, z24.h // 10000001-00001000-10000011-10011011 +// CHECK-INST: umop4s za3.s, { z12.h, z13.h }, z24.h +// CHECK-ENCODING: [0x9b,0x83,0x08,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8108839b + +umop4s za3.s, {z14.h-z15.h}, z30.h // 10000001-00001110-10000011-11011011 +// CHECK-INST: umop4s za3.s, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdb,0x83,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e83db + +umop4s za0.s, {z0.h-z1.h}, {z16.h-z17.h} // 10000001-00010000-10000010-00011000 +// CHECK-INST: umop4s za0.s, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x82,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108218 + +umop4s za3.s, {z12.h-z13.h}, {z24.h-z25.h} // 10000001-00011000-10000011-10011011 +// CHECK-INST: umop4s za3.s, { z12.h, z13.h }, { z24.h, z25.h } +// CHECK-ENCODING: [0x9b,0x83,0x18,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 8118839b + +umop4s za3.s, {z14.h-z15.h}, {z30.h-z31.h} // 10000001-00011110-10000011-11011011 +// CHECK-INST: umop4s za3.s, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: 
[0xdb,0x83,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e83db diff --git a/llvm/test/MC/AArch64/SME2p2/umop4s-64.s b/llvm/test/MC/AArch64/SME2p2/umop4s-64.s new file mode 100644 index 0000000000000..5f39ca7cf373c --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4s-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +umop4s za0.d, z0.h, z16.h // 10100001-11100000-00000000-00011000 +// CHECK-INST: umop4s za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x00,0xe0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e00018 + +umop4s za5.d, z10.h, z20.h // 10100001-11100100-00000001-01011101 +// CHECK-INST: umop4s za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x5d,0x01,0xe4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e4015d + +umop4s za7.d, z14.h, z30.h // 10100001-11101110-00000001-11011111 +// CHECK-INST: umop4s za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xdf,0x01,0xee,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ee01df + +umop4s za0.d, z0.h, {z16.h-z17.h} // 10100001-11110000-00000000-00011000 +// CHECK-INST: umop4s za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x00,0xf0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f00018 + +umop4s za5.d, z10.h, {z20.h-z21.h} // 10100001-11110100-00000001-01011101 +// CHECK-INST: umop4s za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x01,0xf4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f4015d + +umop4s za7.d, z14.h, {z30.h-z31.h} // 10100001-11111110-00000001-11011111 +// CHECK-INST: umop4s za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x01,0xfe,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1fe01df + +umop4s za0.d, {z0.h-z1.h}, z16.h // 10100001-11100000-00000010-00011000 +// CHECK-INST: umop4s za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x02,0xe0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e00218 + +umop4s za5.d, {z10.h-z11.h}, z20.h // 
10100001-11100100-00000011-01011101 +// CHECK-INST: umop4s za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x5d,0x03,0xe4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1e4035d + +umop4s za7.d, {z14.h-z15.h}, z30.h // 10100001-11101110-00000011-11011111 +// CHECK-INST: umop4s za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdf,0x03,0xee,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ee03df + +umop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100001-11110000-00000010-00011000 +// CHECK-INST: umop4s za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x02,0xf0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f00218 + +umop4s za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100001-11110100-00000011-01011101 +// CHECK-INST: umop4s za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x03,0xf4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1f4035d + +umop4s za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100001-11111110-00000011-11011111 +// CHECK-INST: umop4s za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x03,0xfe,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1fe03df diff --git a/llvm/test/MC/AArch64/SME2p2/umop4s-8to32.s b/llvm/test/MC/AArch64/SME2p2/umop4s-8to32.s new file mode 100644 index 0000000000000..74cbffc5ef01d --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4s-8to32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d 
--mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +umop4s za0.s, z0.b, z16.b // 10000001-00100000-10000000-00010000 +// CHECK-INST: umop4s za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x10,0x80,0x20,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81208010 + +umop4s za1.s, z10.b, z20.b // 10000001-00100100-10000001-01010001 +// CHECK-INST: umop4s za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x51,0x81,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81248151 + +umop4s za3.s, z14.b, z30.b // 10000001-00101110-10000001-11010011 +// CHECK-INST: umop4s za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xd3,0x81,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e81d3 + +umop4s za0.s, z0.b, {z16.b-z17.b} // 10000001-00110000-10000000-00010000 +// CHECK-INST: umop4s za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x80,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81308010 + +umop4s za1.s, z10.b, {z20.b-z21.b} // 10000001-00110100-10000001-01010001 +// CHECK-INST: umop4s za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x81,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81348151 + +umop4s za3.s, z14.b, {z30.b-z31.b} // 10000001-00111110-10000001-11010011 +// CHECK-INST: umop4s za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x81,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e81d3 + +umop4s za0.s, {z0.b-z1.b}, z16.b // 10000001-00100000-10000010-00010000 +// CHECK-INST: umop4s za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x10,0x82,0x20,0x81] 
+// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81208210 + +umop4s za1.s, {z10.b-z11.b}, z20.b // 10000001-00100100-10000011-01010001 +// CHECK-INST: umop4s za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x51,0x83,0x24,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81248351 + +umop4s za3.s, {z14.b-z15.b}, z30.b // 10000001-00101110-10000011-11010011 +// CHECK-INST: umop4s za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xd3,0x83,0x2e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 812e83d3 + +umop4s za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000001-00110000-10000010-00010000 +// CHECK-INST: umop4s za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x82,0x30,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81308210 + +umop4s za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000001-00110100-10000011-01010001 +// CHECK-INST: umop4s za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x83,0x34,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81348351 + +umop4s za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000001-00111110-10000011-11010011 +// CHECK-INST: umop4s za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x83,0x3e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 813e83d3 diff --git a/llvm/test/MC/AArch64/SME2p2/umop4s-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/umop4s-diagnostics.s new file mode 100644 index 0000000000000..c5f9877522e3d --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/umop4s-diagnostics.s @@ -0,0 +1,82 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d => za0-za7 + +umop4s za4.s, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4s za4.s, 
z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4s za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: umop4s za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +umop4s za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: umop4s za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za0.s, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: umop4s za0.s, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za0.s, z16.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: umop4s za0.s, z16.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}} + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +umop4s za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4s za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za0.s, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: umop4s za0.s, z14.h, {z14.h-z15.h} +// CHECK-NOT: 
[[@LINE-1]]:{{[0-9]+}}: + +umop4s za0.s, z14.b, z14.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: umop4s za0.s, z14.b, z14.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .s => .h, .d => .h + +umop4s za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4s za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: umop4s za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: umop4s za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +umop4s za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: umop4s za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4a-32.s b/llvm/test/MC/AArch64/SME2p2/usmop4a-32.s new file mode 100644 index 0000000000000..2bbddfd1e64e0 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4a-32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 
-filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +usmop4a za0.s, z0.b, z16.b // 10000001-00000000-10000000-00000000 +// CHECK-INST: usmop4a za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x00,0x80,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008000 + +usmop4a za1.s, z10.b, z20.b // 10000001-00000100-10000001-01000001 +// CHECK-INST: usmop4a za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x41,0x81,0x04,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81048141 + +usmop4a za3.s, z14.b, z30.b // 10000001-00001110-10000001-11000011 +// CHECK-INST: usmop4a za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xc3,0x81,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e81c3 + +usmop4a za0.s, z0.b, {z16.b-z17.b} // 10000001-00010000-10000000-00000000 +// CHECK-INST: usmop4a za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x80,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108000 + +usmop4a za1.s, z10.b, {z20.b-z21.b} // 10000001-00010100-10000001-01000001 +// CHECK-INST: usmop4a za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x81,0x14,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81148141 + +usmop4a za3.s, z14.b, {z30.b-z31.b} // 10000001-00011110-10000001-11000011 +// CHECK-INST: usmop4a za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x81,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e81c3 + +usmop4a za0.s, {z0.b-z1.b}, z16.b // 10000001-00000000-10000010-00000000 +// CHECK-INST: 
usmop4a za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x00,0x82,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008200 + +usmop4a za1.s, {z10.b-z11.b}, z20.b // 10000001-00000100-10000011-01000001 +// CHECK-INST: usmop4a za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x41,0x83,0x04,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81048341 + +usmop4a za3.s, {z14.b-z15.b}, z30.b // 10000001-00001110-10000011-11000011 +// CHECK-INST: usmop4a za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xc3,0x83,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e83c3 + +usmop4a za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000001-00010000-10000010-00000000 +// CHECK-INST: usmop4a za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x00,0x82,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108200 + +usmop4a za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000001-00010100-10000011-01000001 +// CHECK-INST: usmop4a za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x41,0x83,0x14,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81148341 + +usmop4a za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000001-00011110-10000011-11000011 +// CHECK-INST: usmop4a za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xc3,0x83,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e83c3 diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4a-64.s b/llvm/test/MC/AArch64/SME2p2/usmop4a-64.s new file mode 100644 index 0000000000000..1da7b31d1b94f --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4a-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +usmop4a za0.d, z0.h, z16.h // 10100001-11000000-00000000-00001000 +// CHECK-INST: usmop4a za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x08,0x00,0xc0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c00008 + +usmop4a za5.d, z10.h, z20.h // 10100001-11000100-00000001-01001101 +// CHECK-INST: usmop4a za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x4d,0x01,0xc4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c4014d + +usmop4a za7.d, z14.h, z30.h // 10100001-11001110-00000001-11001111 +// CHECK-INST: usmop4a za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xcf,0x01,0xce,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ce01cf + +usmop4a za0.d, z0.h, {z16.h-z17.h} // 10100001-11010000-00000000-00001000 +// CHECK-INST: usmop4a za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x00,0xd0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d00008 + +usmop4a za5.d, z10.h, {z20.h-z21.h} // 10100001-11010100-00000001-01001101 +// CHECK-INST: usmop4a za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x01,0xd4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d4014d + +usmop4a za7.d, z14.h, {z30.h-z31.h} // 10100001-11011110-00000001-11001111 +// CHECK-INST: usmop4a za7.d, z14.h, { z30.h, z31.h } +// 
CHECK-ENCODING: [0xcf,0x01,0xde,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1de01cf + +usmop4a za0.d, {z0.h-z1.h}, z16.h // 10100001-11000000-00000010-00001000 +// CHECK-INST: usmop4a za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x08,0x02,0xc0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c00208 + +usmop4a za5.d, {z10.h-z11.h}, z20.h // 10100001-11000100-00000011-01001101 +// CHECK-INST: usmop4a za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x4d,0x03,0xc4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c4034d + +usmop4a za7.d, {z14.h-z15.h}, z30.h // 10100001-11001110-00000011-11001111 +// CHECK-INST: usmop4a za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xcf,0x03,0xce,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ce03cf + +usmop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100001-11010000-00000010-00001000 +// CHECK-INST: usmop4a za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x08,0x02,0xd0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d00208 + +usmop4a za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100001-11010100-00000011-01001101 +// CHECK-INST: usmop4a za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x4d,0x03,0xd4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d4034d + +usmop4a za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100001-11011110-00000011-11001111 +// CHECK-INST: usmop4a za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xcf,0x03,0xde,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1de03cf diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4a-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/usmop4a-diagnostics.s new file mode 100644 index 0000000000000..18d9963a3274f --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4a-diagnostics.s @@ -0,0 +1,68 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < 
%s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d => za0-za7 + +usmop4a za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: usmop4a za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: usmop4a za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +usmop4a za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: usmop4a za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za0.d, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: usmop4a za0.d, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +usmop4a za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: usmop4a za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za0.d, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: usmop4a za0.d, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +// 
------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .d => .h + +usmop4a za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: usmop4a za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: usmop4a za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: usmop4a za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4a za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: usmop4a za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4s-32.s b/llvm/test/MC/AArch64/SME2p2/usmop4s-32.s new file mode 100644 index 0000000000000..e8cc918e55d02 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4s-32.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +usmop4s za0.s, z0.b, z16.b // 10000001-00000000-10000000-00010000 +// CHECK-INST: usmop4s za0.s, z0.b, z16.b +// CHECK-ENCODING: [0x10,0x80,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008010 + +usmop4s za1.s, z10.b, z20.b // 10000001-00000100-10000001-01010001 +// CHECK-INST: usmop4s za1.s, z10.b, z20.b +// CHECK-ENCODING: [0x51,0x81,0x04,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81048151 + +usmop4s za3.s, z14.b, z30.b // 10000001-00001110-10000001-11010011 +// CHECK-INST: usmop4s za3.s, z14.b, z30.b +// CHECK-ENCODING: [0xd3,0x81,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e81d3 + +usmop4s za0.s, z0.b, {z16.b-z17.b} // 10000001-00010000-10000000-00010000 +// CHECK-INST: usmop4s za0.s, z0.b, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x80,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108010 + +usmop4s za1.s, z10.b, {z20.b-z21.b} // 10000001-00010100-10000001-01010001 +// CHECK-INST: usmop4s za1.s, z10.b, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x81,0x14,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81148151 + +usmop4s za3.s, z14.b, {z30.b-z31.b} // 10000001-00011110-10000001-11010011 +// CHECK-INST: usmop4s za3.s, z14.b, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x81,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e81d3 + +usmop4s za0.s, {z0.b-z1.b}, z16.b // 10000001-00000000-10000010-00010000 +// CHECK-INST: usmop4s za0.s, { z0.b, z1.b }, z16.b +// CHECK-ENCODING: [0x10,0x82,0x00,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81008210 + +usmop4s za1.s, {z10.b-z11.b}, z20.b // 
10000001-00000100-10000011-01010001 +// CHECK-INST: usmop4s za1.s, { z10.b, z11.b }, z20.b +// CHECK-ENCODING: [0x51,0x83,0x04,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81048351 + +usmop4s za3.s, {z14.b-z15.b}, z30.b // 10000001-00001110-10000011-11010011 +// CHECK-INST: usmop4s za3.s, { z14.b, z15.b }, z30.b +// CHECK-ENCODING: [0xd3,0x83,0x0e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 810e83d3 + +usmop4s za0.s, {z0.b-z1.b}, {z16.b-z17.b} // 10000001-00010000-10000010-00010000 +// CHECK-INST: usmop4s za0.s, { z0.b, z1.b }, { z16.b, z17.b } +// CHECK-ENCODING: [0x10,0x82,0x10,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81108210 + +usmop4s za1.s, {z10.b-z11.b}, {z20.b-z21.b} // 10000001-00010100-10000011-01010001 +// CHECK-INST: usmop4s za1.s, { z10.b, z11.b }, { z20.b, z21.b } +// CHECK-ENCODING: [0x51,0x83,0x14,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81148351 + +usmop4s za3.s, {z14.b-z15.b}, {z30.b-z31.b} // 10000001-00011110-10000011-11010011 +// CHECK-INST: usmop4s za3.s, { z14.b, z15.b }, { z30.b, z31.b } +// CHECK-ENCODING: [0xd3,0x83,0x1e,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 811e83d3 diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4s-64.s b/llvm/test/MC/AArch64/SME2p2/usmop4s-64.s new file mode 100644 index 0000000000000..22df9e07158d2 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4s-64.s @@ -0,0 +1,85 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2,+sme-i16i64 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2,+sme-i16i64 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +usmop4s za0.d, z0.h, z16.h // 10100001-11000000-00000000-00011000 +// CHECK-INST: usmop4s za0.d, z0.h, z16.h +// CHECK-ENCODING: [0x18,0x00,0xc0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c00018 + +usmop4s za5.d, z10.h, z20.h // 10100001-11000100-00000001-01011101 +// CHECK-INST: usmop4s za5.d, z10.h, z20.h +// CHECK-ENCODING: [0x5d,0x01,0xc4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c4015d + +usmop4s za7.d, z14.h, z30.h // 10100001-11001110-00000001-11011111 +// CHECK-INST: usmop4s za7.d, z14.h, z30.h +// CHECK-ENCODING: [0xdf,0x01,0xce,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ce01df + +usmop4s za0.d, z0.h, {z16.h-z17.h} // 10100001-11010000-00000000-00011000 +// CHECK-INST: usmop4s za0.d, z0.h, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x00,0xd0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d00018 + +usmop4s za5.d, z10.h, {z20.h-z21.h} // 10100001-11010100-00000001-01011101 +// CHECK-INST: usmop4s za5.d, z10.h, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x01,0xd4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d4015d + +usmop4s za7.d, z14.h, {z30.h-z31.h} // 10100001-11011110-00000001-11011111 +// CHECK-INST: usmop4s za7.d, z14.h, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x01,0xde,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1de01df + +usmop4s za0.d, {z0.h-z1.h}, z16.h // 
10100001-11000000-00000010-00011000 +// CHECK-INST: usmop4s za0.d, { z0.h, z1.h }, z16.h +// CHECK-ENCODING: [0x18,0x02,0xc0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c00218 + +usmop4s za5.d, {z10.h-z11.h}, z20.h // 10100001-11000100-00000011-01011101 +// CHECK-INST: usmop4s za5.d, { z10.h, z11.h }, z20.h +// CHECK-ENCODING: [0x5d,0x03,0xc4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1c4035d + +usmop4s za7.d, {z14.h-z15.h}, z30.h // 10100001-11001110-00000011-11011111 +// CHECK-INST: usmop4s za7.d, { z14.h, z15.h }, z30.h +// CHECK-ENCODING: [0xdf,0x03,0xce,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1ce03df + +usmop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h} // 10100001-11010000-00000010-00011000 +// CHECK-INST: usmop4s za0.d, { z0.h, z1.h }, { z16.h, z17.h } +// CHECK-ENCODING: [0x18,0x02,0xd0,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d00218 + +usmop4s za5.d, {z10.h-z11.h}, {z20.h-z21.h} // 10100001-11010100-00000011-01011101 +// CHECK-INST: usmop4s za5.d, { z10.h, z11.h }, { z20.h, z21.h } +// CHECK-ENCODING: [0x5d,0x03,0xd4,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1d4035d + +usmop4s za7.d, {z14.h-z15.h}, {z30.h-z31.h} // 10100001-11011110-00000011-11011111 +// CHECK-INST: usmop4s za7.d, { z14.h, z15.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x03,0xde,0xa1] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: a1de03df diff --git a/llvm/test/MC/AArch64/SME2p2/usmop4s-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/usmop4s-diagnostics.s new file mode 100644 index 0000000000000..07353b6350149 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/usmop4s-diagnostics.s @@ -0,0 +1,68 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2,+sme-i16i64 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid tile +// +// expected: .s => za0-za3, .d 
=> za0-za7 + +usmop4s za4.s, z0.b, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: usmop4s za4.s, z0.b, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za8.d, z0.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: usmop4s za8.d, z0.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid first operand (expected z0..z15) + +usmop4s za0.d, z16.h, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: usmop4s za0.d, z16.h, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za0.d, {z16.h-z17.h}, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: usmop4s za0.d, {z16.h-z17.h}, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// ------------------------------------------------------------------------- // +// Invalid second operand (expected z16..z31) + +usmop4s za0.d, z14.h, z14.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: usmop4s za0.d, z14.h, z14.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za0.d, z14.h, {z14.h-z15.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: usmop4s za0.d, z14.h, {z14.h-z15.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +// ------------------------------------------------------------------------- // +// Invalid ZPR type suffix +// +// expected: .s => .b, .d => .h + +usmop4s za3.s, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid 
restricted vector register, expected even register in z0.b..z14.b +// CHECK-NEXT: usmop4s za3.s, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za3.s, z0.b, z16.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.b..z30.b +// CHECK-NEXT: usmop4s za3.s, z0.b, z16.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za3.d, z0.h, z16.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h +// CHECK-NEXT: usmop4s za3.d, z0.h, z16.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +usmop4s za3.d, z0.s, z16.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h +// CHECK-NEXT: usmop4s za3.d, z0.s, z16.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/ustmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/ustmopa-diagnostics.s new file mode 100644 index 0000000000000..bba7a5abf67c7 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/ustmopa-diagnostics.s @@ -0,0 +1,76 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Invalid ZA register + +ustmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ustmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid vector list operand + +ustmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: ustmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// 
--------------------------------------------------------------------------// +// Invalid ZK register + +ustmopa za3.s, {z28.b-z29.b}, z31.b, z19[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: ustmopa za3.s, {z28.b-z29.b}, z31.b, z19[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.s, {z28.b-z29.b}, z31.b, z24[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: ustmopa za3.s, {z28.b-z29.b}, z31.b, z24[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.s, {z28.b-z29.b}, z31.b, z27[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: ustmopa za3.s, {z28.b-z29.b}, z31.b, z27[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid immediate + +ustmopa za3.s, {z28.b-z29.b}, z31.b, z29[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. 
+// CHECK-NEXT: ustmopa za3.s, {z28.b-z29.b}, z31.b, z29[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZPR type suffix + +ustmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: ustmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: ustmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.s, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ustmopa za3.s, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ustmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: ustmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +ustmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: ustmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/ustmopa.s b/llvm/test/MC/AArch64/SME2p2/ustmopa.s new file mode 100644 index 0000000000000..626b5b436dcb7 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/ustmopa.s @@ -0,0 +1,31 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ 
+// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +ustmopa za0.s, {z0.b-z1.b}, z0.b, z20[0] // 10000001-01000000-10000000-00000000 +// CHECK-INST: ustmopa za0.s, { z0.b, z1.b }, z0.b, z20[0] +// CHECK-ENCODING: [0x00,0x80,0x40,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81408000 + +ustmopa za3.s, {z12.b-z13.b}, z8.b, z23[3] // 10000001-01001000-10001101-10110011 +// CHECK-INST: ustmopa za3.s, { z12.b, z13.b }, z8.b, z23[3] +// CHECK-ENCODING: [0xb3,0x8d,0x48,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81488db3 + +ustmopa za3.s, {z30.b-z31.b}, z31.b, z31[3] // 10000001-01011111-10011111-11110011 +// CHECK-INST: ustmopa za3.s, { z30.b, z31.b }, z31.b, z31[3] +// CHECK-ENCODING: [0xf3,0x9f,0x5f,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 815f9ff3 diff --git a/llvm/test/MC/AArch64/SME2p2/utmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/utmopa-diagnostics.s new file mode 100644 index 0000000000000..fed08e4d7cc08 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/utmopa-diagnostics.s @@ -0,0 +1,100 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Invalid ZA register + 
+utmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: utmopa za4.s, {z30.b-z31.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za4.s, {z30.h-z31.h}, z31.h, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: utmopa za4.s, {z30.h-z31.h}, z31.h, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid vector list operand + +utmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: utmopa za3.s, {z29.b-z30.b}, z31.b, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z29.h-z30.h}, z31.h, z31[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: utmopa za3.s, {z29.h-z30.h}, z31.h, z31[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZK register +utmopa za3.s, {z28.b-z29.b}, z31.b, z19[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.b-z29.b}, z31.b, z19[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.b-z29.b}, z31.b, z24[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.b-z29.b}, z31.b, z24[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.b-z29.b}, z31.b, z27[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or 
z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.b-z29.b}, z31.b, z27[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.h-z29.h}, z31.h, z19[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.h-z29.h}, z31.h, z19[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.h-z29.h}, z31.h, z24[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.h-z29.h}, z31.h, z24[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.h-z29.h}, z31.h, z27[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted vector register, expected register in z20..z23 or z28..z31 +// CHECK-NEXT: utmopa za3.s, {z28.h-z29.h}, z31.h, z27[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid immediate + +utmopa za3.s, {z28.b-z29.b}, z31.b, z20[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +// CHECK-NEXT: utmopa za3.s, {z28.b-z29.b}, z31.b, z20[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.h-z29.h}, z31.h, z20[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. 
+// CHECK-NEXT: utmopa za3.s, {z28.h-z29.h}, z31.h, z20[4] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid ZPR type suffix + +utmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: utmopa za0.h, {z28.b-z29.b}, z31.b, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: utmopa za0.h, {z28.h-z29.h}, z31.h, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: utmopa za3.s, {z28.s-z29.s}, z31.s, z20[2] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: utmopa za3.d, {z28.s-z29.s}, z31.s, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +utmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s +// CHECK-NEXT: utmopa za3.d, {z28.h-z29.h}, z31.h, z20[3] +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SME2p2/utmopa.s b/llvm/test/MC/AArch64/SME2p2/utmopa.s new file mode 100644 index 0000000000000..bd127e59ece08 --- /dev/null +++ b/llvm/test/MC/AArch64/SME2p2/utmopa.s @@ -0,0 +1,49 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-INST +// RUN: 
llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2p2 --no-print-imm-hex - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +utmopa za0.s, {z0.h-z1.h}, z0.h, z20[0] // 10000001-01000000-10000000-00001000 +// CHECK-INST: utmopa za0.s, { z0.h, z1.h }, z0.h, z20[0] +// CHECK-ENCODING: [0x08,0x80,0x40,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81408008 + +utmopa za3.s, {z12.h-z13.h}, z8.h, z23[3] // 10000001-01001000-10001101-10111011 +// CHECK-INST: utmopa za3.s, { z12.h, z13.h }, z8.h, z23[3] +// CHECK-ENCODING: [0xbb,0x8d,0x48,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81488dbb + +utmopa za3.s, {z30.h-z31.h}, z31.h, z31[3] // 10000001-01011111-10011111-11111011 +// CHECK-INST: utmopa za3.s, { z30.h, z31.h }, z31.h, z31[3] +// CHECK-ENCODING: [0xfb,0x9f,0x5f,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 815f9ffb + +utmopa za0.s, {z0.b-z1.b}, z0.b, z20[0] // 10000001-01100000-10000000-00000000 +// CHECK-INST: utmopa za0.s, { z0.b, z1.b }, z0.b, z20[0] +// CHECK-ENCODING: [0x00,0x80,0x60,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81608000 + +utmopa za3.s, {z12.b-z13.b}, z8.b, z23[3] // 10000001-01101000-10001101-10110011 +// CHECK-INST: utmopa za3.s, { z12.b, z13.b }, z8.b, z23[3] +// CHECK-ENCODING: [0xb3,0x8d,0x68,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 81688db3 + +utmopa za3.s, {z30.b-z31.b}, z31.b, z31[3] // 10000001-01111111-10011111-11110011 +// CHECK-INST: utmopa za3.s, { z30.b, z31.b }, z31.b, z31[3] +// CHECK-ENCODING: 
[0xf3,0x9f,0x7f,0x81] +// CHECK-ERROR: instruction requires: sme2p2 +// CHECK-UNKNOWN: 817f9ff3 diff --git a/llvm/test/MC/AArch64/SVE2p2/rbit_z-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/rbit_z-diagnostics.s new file mode 100644 index 0000000000000..e20d8c4c1b97f --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/rbit_z-diagnostics.s @@ -0,0 +1,74 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid predicate + +rbit z0.b, p8/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: rbit z0.b, p8/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.h, p8/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: rbit z0.h, p8/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.s, p8/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: rbit z0.s, p8/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.d, p8/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: rbit z0.d, p8/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid element widths + +rbit z0.b, p7/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: rbit z0.b, p7/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.h, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: rbit z0.h, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.s, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: rbit z0.s, 
p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +rbit z0.d, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: rbit z0.d, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.b, p0/z, z7.b +rbit z0.b, p0/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: rbit z0.b, p0/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0, z7 +rbit z0.h, p0/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: rbit z0.h, p0/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0.s, p0/z, z7.s +rbit z0.s, p0/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: rbit z0.s, p0/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0, z7 +rbit z0.d, p0/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: rbit z0.d, p0/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/rbit_z.s b/llvm/test/MC/AArch64/SVE2p2/rbit_z.s new file mode 100644 index 0000000000000..3eb9c2d79306f --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/rbit_z.s @@ -0,0 +1,45 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s 
--check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +rbit z0.b, p0/z, z0.b // 00000101-00100111-10100000-00000000 +// CHECK-INST: rbit z0.b, p0/z, z0.b +// CHECK-ENCODING: [0x00,0xa0,0x27,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 0527a000 + +rbit z21.b, p5/z, z10.b // 00000101-00100111-10110101-01010101 +// CHECK-INST: rbit z21.b, p5/z, z10.b +// CHECK-ENCODING: [0x55,0xb5,0x27,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 0527b555 + +rbit z23.h, p3/z, z13.h // 00000101-01100111-10101101-10110111 +// CHECK-INST: rbit z23.h, p3/z, z13.h +// CHECK-ENCODING: [0xb7,0xad,0x67,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 0567adb7 + +rbit z23.s, p3/z, z13.s // 00000101-10100111-10101101-10110111 +// CHECK-INST: rbit z23.s, p3/z, z13.s +// CHECK-ENCODING: [0xb7,0xad,0xa7,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05a7adb7 + +rbit z31.d, p7/z, z31.d // 00000101-11100111-10111111-11111111 +// CHECK-INST: rbit z31.d, p7/z, z31.d +// CHECK-ENCODING: [0xff,0xbf,0xe7,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e7bfff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revb_z-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/revb_z-diagnostics.s new file mode 
100644 index 0000000000000..4cf40f8fd7923 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revb_z-diagnostics.s @@ -0,0 +1,63 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid predicate + +revb z0.h, p8/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revb z0.h, p8/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revb z0.s, p8/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revb z0.s, p8/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revb z0.d, p8/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revb z0.d, p8/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid element widths + +revb z0.b, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revb z0.b, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revb z0.h, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revb z0.h, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revb z0.s, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revb z0.s, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revb z0.d, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revb z0.d, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0, z7 +revb z0.h, p0/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: 
error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revb z0.h, p0/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0.s, p0/z, z7.s +revb z0.s, p0/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revb z0.s, p0/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0, z7 +revb z0.d, p0/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revb z0.d, p0/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revb_z.s b/llvm/test/MC/AArch64/SVE2p2/revb_z.s new file mode 100644 index 0000000000000..16dee586bd1d1 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revb_z.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +revb z0.h, p0/z, z0.h // 00000101-01100100-10100000-00000000 +// CHECK-INST: revb z0.h, p0/z, z0.h +// CHECK-ENCODING: [0x00,0xa0,0x64,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 0564a000 + +revb z23.s, p3/z, z13.s // 00000101-10100100-10101101-10110111 +// CHECK-INST: revb z23.s, p3/z, z13.s +// CHECK-ENCODING: [0xb7,0xad,0xa4,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05a4adb7 + +revb z31.d, p7/z, z31.d // 00000101-11100100-10111111-11111111 +// CHECK-INST: revb z31.d, p7/z, z31.d +// CHECK-ENCODING: [0xff,0xbf,0xe4,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e4bfff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revd_z-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/revd_z-diagnostics.s new file mode 100644 index 0000000000000..ed031e4a8763d --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revd_z-diagnostics.s @@ -0,0 +1,56 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 2>&1 < %s| FileCheck %s + +// ------------------------------------------------------------------------- // +// Invalid predicate + +revd z0.q, p8/z, z0.q +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revd z0.q, p8/z, z0.q +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid element widths + +revd z0.b, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.b, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.h, p7/z, z0.h +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.h, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.s, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.s, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.h, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.h, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.s, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.s, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.d, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.d, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revd z0.q, p7/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revd z0.q, p7/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0, z7 +revd z0.q, p0/z, z0.q +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revd z0.q, p0/z, z0.q +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revd_z.s b/llvm/test/MC/AArch64/SVE2p2/revd_z.s new file mode 100644 index 0000000000000..b8675f0268679 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revd_z.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR 
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +revd z0.q, p0/z, z0.q // 00000101-00101110-10100000-00000000 +// CHECK-INST: revd z0.q, p0/z, z0.q +// CHECK-ENCODING: [0x00,0xa0,0x2e,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 052ea000 + +revd z23.q, p3/z, z13.q // 00000101-00101110-10101101-10110111 +// CHECK-INST: revd z23.q, p3/z, z13.q +// CHECK-ENCODING: [0xb7,0xad,0x2e,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 052eadb7 + +revd z31.q, p7/z, z31.q // 00000101-00101110-10111111-11111111 +// CHECK-INST: revd z31.q, p7/z, z31.q +// CHECK-ENCODING: [0xff,0xbf,0x2e,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 052ebfff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revh_z-diagnostics.s b/llvm/test/MC/AArch64/SVE2p2/revh_z-diagnostics.s new file mode 100644 index 0000000000000..c7b83bdf229ec --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revh_z-diagnostics.s @@ -0,0 +1,58 @@ + +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 2>&1 < %s| FileCheck %s +// ------------------------------------------------------------------------- // + +// Invalid predicate + +revh z0.s, p8/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revh 
z0.s, p8/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revh z0.d, p8/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revh z0.d, p8/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid element widths + +revh z0.b, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revh z0.b, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revh z0.h, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revh z0.h, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revh z0.q, p7/z, z0.q +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revh z0.q, p7/z, z0.q +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revh z0.s, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revh z0.s, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revh z0.d, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revh z0.d, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.s, p0/z, z7.s +revh z0.s, p0/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revh z0.s, p0/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0, z7 +revh z0.d, p0/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revh z0.d, p0/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revh_z.s 
b/llvm/test/MC/AArch64/SVE2p2/revh_z.s new file mode 100644 index 0000000000000..2a56025bde916 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revh_z.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +revh z0.s, p0/z, z0.s // 00000101-10100101-10100000-00000000 +// CHECK-INST: revh z0.s, p0/z, z0.s +// CHECK-ENCODING: [0x00,0xa0,0xa5,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05a5a000 + +revh z23.s, p3/z, z13.s // 00000101-10100101-10101101-10110111 +// CHECK-INST: revh z23.s, p3/z, z13.s +// CHECK-ENCODING: [0xb7,0xad,0xa5,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05a5adb7 + +revh z31.d, p7/z, z31.d // 00000101-11100101-10111111-11111111 +// CHECK-INST: revh z31.d, p7/z, z31.d +// CHECK-ENCODING: [0xff,0xbf,0xe5,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e5bfff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revw_z-diagnostics.s 
b/llvm/test/MC/AArch64/SVE2p2/revw_z-diagnostics.s new file mode 100644 index 0000000000000..478492dccafa4 --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revw_z-diagnostics.s @@ -0,0 +1,51 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 2>&1 < %s| FileCheck %s +// ------------------------------------------------------------------------- // +// Invalid predicate + +revw z0.d, p8/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +// CHECK-NEXT: revw z0.d, p8/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Invalid element widths + +revw z0.b, p7/z, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revw z0.b, p7/z, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revw z0.h, p7/z, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revw z0.h, p7/z, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revw z0.s, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revw z0.s, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revw z0.q, p7/z, z0.q +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revw z0.q, p7/z, z0.q +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +revw z0.d, p7/z, z0.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: revw z0.d, p7/z, z0.s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +revw z0.d, p0/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revw z0.d, p0/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +movprfx z0, z7 +revw z0.d, 
p0/z, z0.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a movprfx, suggest replacing movprfx with mov +// CHECK-NEXT: revw z0.d, p0/z, z0.d +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/SVE2p2/revw_z.s b/llvm/test/MC/AArch64/SVE2p2/revw_z.s new file mode 100644 index 0000000000000..b695398098d5a --- /dev/null +++ b/llvm/test/MC/AArch64/SVE2p2/revw_z.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2p2 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p2 < %s \ +// RUN: | llvm-objdump -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p2 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2p2 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +revw z0.d, p0/z, z0.d // 00000101-11100110-10100000-00000000 +// CHECK-INST: revw z0.d, p0/z, z0.d +// CHECK-ENCODING: [0x00,0xa0,0xe6,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e6a000 + +revw z23.d, p3/z, z13.d // 00000101-11100110-10101101-10110111 +// CHECK-INST: revw z23.d, p3/z, z13.d +// CHECK-ENCODING: [0xb7,0xad,0xe6,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e6adb7 + +revw z31.d, p7/z, z31.d // 00000101-11100110-10111111-11111111 +// CHECK-INST: revw z31.d, p7/z, z31.d +// CHECK-ENCODING: [0xff,0xbf,0xe6,0x05] +// CHECK-ERROR: instruction requires: sme2p2 or sve2p2 +// CHECK-UNKNOWN: 05e6bfff \ No newline at end of file diff --git a/llvm/test/MC/AArch64/adrp-auth-relocation.s b/llvm/test/MC/AArch64/adrp-auth-relocation.s new file mode 100644 index 0000000000000..57021c71632ff --- /dev/null +++ b/llvm/test/MC/AArch64/adrp-auth-relocation.s @@ -0,0 +1,12 @@ +// RUN: llvm-mc -triple=aarch64-linux-gnu -filetype=obj -o - %s | llvm-readobj -r - | FileCheck %s +// RUN: not llvm-mc -triple=aarch64-linux-gnu_ilp32 -filetype=obj \ +// RUN: -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-ILP32 %s + +.text +adrp x0, :got_auth:sym + +.global sym +sym: + +// CHECK: R_AARCH64_AUTH_ADR_GOT_PAGE sym +// CHECK-ILP32: error: ILP32 ADRP AUTH relocation not supported (LP64 eqv: AUTH_ADR_GOT_PAGE) diff --git a/llvm/test/MC/AArch64/arm64-elf-relocs.s b/llvm/test/MC/AArch64/arm64-elf-relocs.s index 8813c4bd7d1aa..f679bb4c82827 100644 --- a/llvm/test/MC/AArch64/arm64-elf-relocs.s +++ b/llvm/test/MC/AArch64/arm64-elf-relocs.s @@ -81,13 +81,17 @@ // CHECK: adrp x15, :got:sym // CHECK-OBJ-LP64: 58 R_AARCH64_ADR_GOT_PAGE sym + adrp 
x15, :got_auth:sym +// CHECK: adrp x15, :got_auth:sym +// CHECK-OBJ-LP64: 5c R_AARCH64_AUTH_ADR_GOT_PAGE sym + adrp x29, :gottprel:sym // CHECK: adrp x29, :gottprel:sym -// CHECK-OBJ-LP64: 5c R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 sym +// CHECK-OBJ-LP64: 60 R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 sym adrp x2, :tlsdesc:sym // CHECK: adrp x2, :tlsdesc:sym -// CHECK-OBJ-LP64: 60 R_AARCH64_TLSDESC_ADR_PAGE21 sym +// CHECK-OBJ-LP64: 64 R_AARCH64_TLSDESC_ADR_PAGE21 sym // LLVM is not competent enough to do this relocation because the // page boundary could occur anywhere after linking. A relocation @@ -96,7 +100,7 @@ .global trickQuestion trickQuestion: // CHECK: adrp x3, trickQuestion -// CHECK-OBJ-LP64: 64 R_AARCH64_ADR_PREL_PG_HI21 trickQuestion +// CHECK-OBJ-LP64: 68 R_AARCH64_ADR_PREL_PG_HI21 trickQuestion ldrb w2, [x3, :lo12:sym] ldrsb w5, [x7, #:lo12:sym] @@ -245,6 +249,16 @@ trickQuestion: // CHECK-OBJ-LP64: R_AARCH64_LD64_GOT_LO12_NC sym // CHECK-OBJ-LP64: R_AARCH64_LD64_GOT_LO12_NC sym+0x7 + ldr x24, [x23, #:got_auth_lo12:sym] + ldr d22, [x21, :got_auth_lo12:sym] + ldr x24, [x23, :got_auth_lo12:sym+7] +// CHECK: ldr x24, [x23, :got_auth_lo12:sym] +// CHECK: ldr d22, [x21, :got_auth_lo12:sym] +// CHECK: ldr x24, [x23, :got_auth_lo12:sym+7] +// CHECK-OBJ-LP64: R_AARCH64_AUTH_LD64_GOT_LO12_NC sym +// CHECK-OBJ-LP64: R_AARCH64_AUTH_LD64_GOT_LO12_NC sym +// CHECK-OBJ-LP64: R_AARCH64_AUTH_LD64_GOT_LO12_NC sym+0x7 + ldr x24, [x23, #:gotpage_lo15:sym] ldr d22, [x21, :gotpage_lo15:sym] ldr d22, [x23, :gotpage_lo15:sym+7] diff --git a/llvm/test/MC/AArch64/ilp32-diagnostics.s b/llvm/test/MC/AArch64/ilp32-diagnostics.s index 8a3bc1398e042..5d9c6e5626b2b 100644 --- a/llvm/test/MC/AArch64/ilp32-diagnostics.s +++ b/llvm/test/MC/AArch64/ilp32-diagnostics.s @@ -69,6 +69,12 @@ ldr x10, [x0, #:gottprel_lo12:var] ldr x24, [x23, #:got_lo12:sym] // ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: LD64_GOT_LO12_NC) +ldr x24, [x23, 
#:got_auth_lo12:sym] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: AUTH_GOT_LO12_NC) + +add x24, x23, #:got_auth_lo12:sym +// ERROR: [[#@LINE-1]]:1: error: ILP32 ADD AUTH relocation not supported (LP64 eqv: AUTH_GOT_ADD_LO12_NC) + ldr x24, [x23, :gottprel_lo12:sym] // ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2-fake16.s new file mode 100644 index 0000000000000..96dd572089436 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2-fake16.s @@ -0,0 +1,2554 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: 
encoding: [0xc1,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x64] + +v_add_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x64] + +v_add_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x64] + +v_add_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x64] + +v_add_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x64] + +v_add_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x64] + +v_add_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x64] + +v_add_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x64] + +v_add_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x64] + +v_add_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x64] + +v_add_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x64] + +v_add_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x64] + +v_add_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x64] + +v_add_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x64] + +v_add_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] + +v_add_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x06] + +v_add_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x06] + +v_add_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x06] + +v_add_f32 v5, s105, v2 +// GFX11: encoding: 
[0x69,0x04,0x0a,0x06] + +v_add_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x06] + +v_add_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x06] + +v_add_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x06] + +v_add_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x06] + +v_add_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x06] + +v_add_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x06] + +v_add_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x06] + +v_add_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x06] + +v_add_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x06] + +v_add_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x06] + +v_add_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] + +v_add_nc_u32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x4a] + +v_add_nc_u32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x4a] + +v_add_nc_u32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x4a] + +v_add_nc_u32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] + +v_and_b32 v5, v1, v2 +// GFX11: encoding: 
[0x01,0x05,0x0a,0x36] + +v_and_b32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x36] + +v_and_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x36] + +v_and_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x36] + +v_and_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x36] + +v_and_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x36] + +v_and_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x36] + +v_and_b32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x36] + +v_and_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x36] + +v_and_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x36] + +v_and_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x36] + +v_and_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x36] + +v_and_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x36] + +v_and_b32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x36] + +v_and_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] + +v_ashrrev_i32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x34] + +v_ashrrev_i32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x34] + +v_ashrrev_i32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, 
src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x34] + +v_ashrrev_i32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] + +v_cndmask_b32 v5, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this 
GPU or mode + +v_cndmask_b32 v5, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands 
are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, exec_hi, v2 +// GFX11: 
encoding: [0x7f,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] + +v_dot2acc_f32_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x04] + +v_dot2acc_f32_f16 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x04] + +v_dot2acc_f32_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x04] + 
+v_dot2acc_f32_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x04] + +v_dot2acc_f32_f16 v255, 0xfe0b, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00] + +v_dot2c_f32_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x04] + +v_dot2c_f32_f16 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x04] + +v_dot2c_f32_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x04] + +v_dot2c_f32_f16 v255, 0xfe0b, v255 +// GFX11: encoding: 
[0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, v1, v2, 0xfe0b +// GFX11: encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, v127, v2, 0xfe0b +// GFX11: encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, s1, v2, 0xfe0b +// GFX11: encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, s105, v2, 0xfe0b +// GFX11: encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b +// GFX11: encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b +// GFX11: encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, ttmp15, v2, 0xfe0b +// GFX11: encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, m0, v2, 0xfe0b +// GFX11: encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, exec_lo, v2, 0xfe0b +// GFX11: encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, exec_hi, v2, 0xfe0b +// GFX11: encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, null, v2, 0xfe0b +// GFX11: encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, -1, v2, 0xfe0b +// GFX11: encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, 0.5, v2, 0xfe0b +// GFX11: encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, src_scc, v2, 0xfe0b +// GFX11: encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b +// GFX11: encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f32 v5, v1, v2, 0xaf123456 +// GFX11: encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, v255, v2, 0xaf123456 +// GFX11: encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, s1, v2, 0xaf123456 +// GFX11: encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, s105, v2, 0xaf123456 +// GFX11: encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 +// GFX11: 
encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 +// GFX11: encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 +// GFX11: encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, m0, v2, 0xaf123456 +// GFX11: encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 +// GFX11: encoding: [0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 +// GFX11: encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, null, v2, 0xaf123456 +// GFX11: encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, -1, v2, 0xaf123456 +// GFX11: encoding: [0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, 0.5, v2, 0xaf123456 +// GFX11: encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, src_scc, v2, 0xaf123456 +// GFX11: encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 +// GFX11: encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] + +v_fmac_dx9_zero_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 
v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x0c] + +v_fmac_dx9_zero_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf] + +v_fmac_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x6c] + +v_fmac_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x6c] + +v_fmac_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x6c] + +v_fmac_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x6c] + +v_fmac_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x6c] + +v_fmac_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x6c] + +v_fmac_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x6c] + +v_fmac_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x6c] + +v_fmac_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x6c] + +v_fmac_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x6c] + +v_fmac_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x6c] + +v_fmac_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x6c] + +v_fmac_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x6c] + +v_fmac_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x6c] + +v_fmac_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] + +v_fmac_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x56] + +v_fmac_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x56] + +v_fmac_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x56] + +v_fmac_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x56] + +v_fmac_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x56] + +v_fmac_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x56] + +v_fmac_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x56] + +v_fmac_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x56] + +v_fmac_f32 v5, exec_lo, v2 +// 
GFX11: encoding: [0x7e,0x04,0x0a,0x56] + +v_fmac_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x56] + +v_fmac_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x56] + +v_fmac_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x56] + +v_fmac_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x56] + +v_fmac_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x56] + +v_fmac_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] + +v_fmac_legacy_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x0c] + +v_fmac_legacy_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x0c] + +v_fmac_legacy_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x0c] + +v_fmac_legacy_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf] + +v_fmamk_f16 v5, v1, 0xfe0b, v3 +// GFX11: encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, v127, 0xfe0b, v3 +// GFX11: encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, s1, 0xfe0b, v3 +// GFX11: encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, 
s105, 0xfe0b, v3 +// GFX11: encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 +// GFX11: encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 +// GFX11: encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 +// GFX11: encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, m0, 0xfe0b, v3 +// GFX11: encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 +// GFX11: encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 +// GFX11: encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, null, 0xfe0b, v3 +// GFX11: encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, -1, 0xfe0b, v3 +// GFX11: encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, 0.5, 0xfe0b, v3 +// GFX11: encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, src_scc, 0xfe0b, v3 +// GFX11: encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f32 v5, v1, 0xaf123456, v3 +// GFX11: encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, v255, 0xaf123456, v3 +// GFX11: encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, s1, 0xaf123456, v3 +// GFX11: encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, s105, 0xaf123456, v3 +// GFX11: encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 +// GFX11: encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 +// GFX11: encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 +// GFX11: encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, m0, 0xaf123456, v3 +// GFX11: encoding: 
[0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 +// GFX11: encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 +// GFX11: encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, null, 0xaf123456, v3 +// GFX11: encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, -1, 0xaf123456, v3 +// GFX11: encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, 0.5, 0xaf123456, v3 +// GFX11: encoding: [0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, src_scc, 0xaf123456, v3 +// GFX11: encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] + +v_ldexp_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x76] + +v_ldexp_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x76] + +v_ldexp_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x76] + +v_ldexp_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x76] + +v_ldexp_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x76] + +v_ldexp_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x76] + +v_ldexp_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x76] + +v_ldexp_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x76] + +v_ldexp_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x76] + +v_ldexp_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x76] + +v_ldexp_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x76] + +v_ldexp_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x76] + +v_ldexp_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x76] + +v_ldexp_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x76] + +v_ldexp_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] + +v_lshlrev_b32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x30] + +v_lshlrev_b32 v5, v255, v2 +// GFX11: encoding: 
[0xff,0x05,0x0a,0x30] + +v_lshlrev_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x30] + +v_lshlrev_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] + +v_lshrrev_b32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x32] + +v_lshrrev_b32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x32] + +v_lshrrev_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, src_scc, v2 +// 
GFX11: encoding: [0xfd,0x04,0x0a,0x32] + +v_lshrrev_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] + +v_max_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x72] + +v_max_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x72] + +v_max_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x72] + +v_max_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x72] + +v_max_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x72] + +v_max_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x72] + +v_max_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x72] + +v_max_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x72] + +v_max_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x72] + +v_max_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x72] + +v_max_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x72] + +v_max_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x72] + +v_max_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x72] + +v_max_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x72] + +v_max_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x72,0x0b,0xfe,0x00,0x00] + +v_max_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x20] + +v_max_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x20] + +v_max_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x20] + +v_max_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x20] + +v_max_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x20] + +v_max_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x20] + +v_max_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x20] + +v_max_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x20] + +v_max_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x20] + +v_max_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x20] + +v_max_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x20] + +v_max_f32 v5, -1, v2 
+// GFX11: encoding: [0xc1,0x04,0x0a,0x20] + +v_max_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x20] + +v_max_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x20] + +v_max_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x21,0x56,0x34,0x12,0xaf] + +v_max_i32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x24] + +v_max_i32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x24] + +v_max_i32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x24] + +v_max_i32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x24] + +v_max_i32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x24] + +v_max_i32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x24] + +v_max_i32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x24] + +v_max_i32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x24] + +v_max_i32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x24] + +v_max_i32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x24] + +v_max_i32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x24] + +v_max_i32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x24] + +v_max_i32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x24] + +v_max_i32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x24] + +v_max_i32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] + +v_max_u32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x28] + +v_max_u32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x28] + +v_max_u32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x28] + +v_max_u32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x28] + +v_max_u32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x28] + +v_max_u32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x28] + +v_max_u32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x28] + +v_max_u32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x28] + +v_max_u32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x28] + +v_max_u32 v5, 
exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x28] + +v_max_u32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x28] + +v_max_u32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x28] + +v_max_u32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x28] + +v_max_u32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x28] + +v_max_u32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] + +v_min_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x74] + +v_min_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x74] + +v_min_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x74] + +v_min_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x74] + +v_min_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x74] + +v_min_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x74] + +v_min_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x74] + +v_min_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x74] + +v_min_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x74] + +v_min_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x74] + +v_min_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x74] + +v_min_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x74] + +v_min_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x74] + +v_min_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x74] + +v_min_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x74,0x0b,0xfe,0x00,0x00] + +v_min_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x1e] + +v_min_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x1e] + +v_min_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x1e] + +v_min_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x1e] + +v_min_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x1e] + +v_min_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x1e] + +v_min_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x1e] + +v_min_f32 v5, 
m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x1e] + +v_min_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x1e] + +v_min_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x1e] + +v_min_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x1e] + +v_min_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x1e] + +v_min_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x1e] + +v_min_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x1e] + +v_min_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x1f,0x56,0x34,0x12,0xaf] + +v_min_i32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x22] + +v_min_i32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x22] + +v_min_i32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x22] + +v_min_i32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x22] + +v_min_i32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x22] + +v_min_i32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x22] + +v_min_i32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x22] + +v_min_i32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x22] + +v_min_i32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x22] + +v_min_i32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x22] + +v_min_i32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x22] + +v_min_i32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x22] + +v_min_i32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x22] + +v_min_i32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x22] + +v_min_i32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] + +v_min_u32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x26] + +v_min_u32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x26] + +v_min_u32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x26] + +v_min_u32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x26] + +v_min_u32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x26] + +v_min_u32 v5, 
vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x26] + +v_min_u32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x26] + +v_min_u32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x26] + +v_min_u32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x26] + +v_min_u32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x26] + +v_min_u32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x26] + +v_min_u32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x26] + +v_min_u32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x26] + +v_min_u32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x26] + +v_min_u32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] + +v_mul_dx9_zero_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] + +v_mul_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x6a] + +v_mul_f16 
v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x6a] + +v_mul_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x6a] + +v_mul_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x6a] + +v_mul_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x6a] + +v_mul_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x6a] + +v_mul_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x6a] + +v_mul_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x6a] + +v_mul_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x6a] + +v_mul_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x6a] + +v_mul_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x6a] + +v_mul_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x6a] + +v_mul_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x6a] + +v_mul_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x6a] + +v_mul_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] + +v_mul_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x10] + +v_mul_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x10] + +v_mul_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x10] + +v_mul_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x10] + +v_mul_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x10] + +v_mul_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x10] + +v_mul_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x10] + +v_mul_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x10] + +v_mul_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x10] + +v_mul_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x10] + +v_mul_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x10] + +v_mul_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x10] + +v_mul_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x10] + +v_mul_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x10] + +v_mul_f32 v255, 0xaf123456, v255 +// 
GFX11: encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32_i24 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x14] + +v_mul_hi_i32_i24 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x14] + +v_mul_hi_i32_i24 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32_u24 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x18] + +v_mul_hi_u32_u24 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x18] + +v_mul_hi_u32_u24 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, exec_hi, v2 +// GFX11: encoding: 
[0x7f,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] + +v_mul_i32_i24 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x12] + +v_mul_i32_i24 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x12] + +v_mul_i32_i24 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x12] + +v_mul_i32_i24 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] + +v_mul_legacy_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x0e] + +v_mul_legacy_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x0e] + +v_mul_legacy_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, vcc_hi, v2 +// GFX11: encoding: 
[0x6b,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] + +v_mul_u32_u24 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x16] + +v_mul_u32_u24 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x16] + +v_mul_u32_u24 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x16] + +v_mul_u32_u24 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] + +v_or_b32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x38] + +v_or_b32 v5, v255, v2 +// GFX11: encoding: 
[0xff,0x05,0x0a,0x38] + +v_or_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x38] + +v_or_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x38] + +v_or_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x38] + +v_or_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x38] + +v_or_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x38] + +v_or_b32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x38] + +v_or_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x38] + +v_or_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x38] + +v_or_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x38] + +v_or_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x38] + +v_or_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x38] + +v_or_b32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x38] + +v_or_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] + +v_pk_fmac_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x78] + +v_pk_fmac_f16 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x78] + +v_pk_fmac_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x78] + +v_pk_fmac_f16 v255, 
0xfe0b, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + 
+v_sub_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x66] + +v_sub_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x66] + +v_sub_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x66] + +v_sub_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x66] + +v_sub_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x66] + +v_sub_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x66] + +v_sub_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x66] + +v_sub_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x66] + +v_sub_f16 v5, exec_lo, v2 +// GFX11: 
encoding: [0x7e,0x04,0x0a,0x66] + +v_sub_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x66] + +v_sub_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x66] + +v_sub_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x66] + +v_sub_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x66] + +v_sub_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x66] + +v_sub_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] + +v_sub_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x08] + +v_sub_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x08] + +v_sub_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x08] + +v_sub_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x08] + +v_sub_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x08] + +v_sub_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x08] + +v_sub_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x08] + +v_sub_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x08] + +v_sub_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x08] + +v_sub_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x08] + +v_sub_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x08] + +v_sub_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x08] + +v_sub_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x08] + +v_sub_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x08] + +v_sub_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] + +v_sub_nc_u32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x4c] + +v_sub_nc_u32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x4c] + +v_sub_nc_u32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, 
ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x4c] + +v_sub_nc_u32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: 
encoding: [0x7d,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: 
encoding: [0x69,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] 
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x68] + +v_subrev_f16 v5, v127, v2 +// GFX11: encoding: [0x7f,0x05,0x0a,0x68] + +v_subrev_f16 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x68] + +v_subrev_f16 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x68] + +v_subrev_f16 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x68] + +v_subrev_f16 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x68] + +v_subrev_f16 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x68] + +v_subrev_f16 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x68] + +v_subrev_f16 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x68] + +v_subrev_f16 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x68] + +v_subrev_f16 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x68] + +v_subrev_f16 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x68] + +v_subrev_f16 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x68] + +v_subrev_f16 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x68] + +v_subrev_f16 v127, 0xfe0b, v127 +// GFX11: encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] + +v_subrev_f32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x0a] + +v_subrev_f32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x0a] + +v_subrev_f32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x0a] + +v_subrev_f32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x0a] + +v_subrev_f32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x0a] + +v_subrev_f32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x0a] + +v_subrev_f32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x0a] + +v_subrev_f32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x0a] + +v_subrev_f32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x0a] + +v_subrev_f32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x0a] + +v_subrev_f32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x0a] 
+ +v_subrev_f32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x0a] + +v_subrev_f32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x0a] + +v_subrev_f32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x0a] + +v_subrev_f32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] + +v_subrev_nc_u32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x4e] + +v_subrev_nc_u32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x4e] + +v_subrev_nc_u32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] + +v_xnor_b32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x3c] + +v_xnor_b32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x3c] + +v_xnor_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x3c] + +v_xnor_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x3c] + +v_xnor_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x3c] + +v_xnor_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x3c] + +v_xnor_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x3c] + +v_xnor_b32 v5, m0, v2 +// 
GFX11: encoding: [0x7d,0x04,0x0a,0x3c] + +v_xnor_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x3c] + +v_xnor_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x3c] + +v_xnor_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x3c] + +v_xnor_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x3c] + +v_xnor_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x3c] + +v_xnor_b32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x3c] + +v_xnor_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] + +v_xor_b32 v5, v1, v2 +// GFX11: encoding: [0x01,0x05,0x0a,0x3a] + +v_xor_b32 v5, v255, v2 +// GFX11: encoding: [0xff,0x05,0x0a,0x3a] + +v_xor_b32 v5, s1, v2 +// GFX11: encoding: [0x01,0x04,0x0a,0x3a] + +v_xor_b32 v5, s105, v2 +// GFX11: encoding: [0x69,0x04,0x0a,0x3a] + +v_xor_b32 v5, vcc_lo, v2 +// GFX11: encoding: [0x6a,0x04,0x0a,0x3a] + +v_xor_b32 v5, vcc_hi, v2 +// GFX11: encoding: [0x6b,0x04,0x0a,0x3a] + +v_xor_b32 v5, ttmp15, v2 +// GFX11: encoding: [0x7b,0x04,0x0a,0x3a] + +v_xor_b32 v5, m0, v2 +// GFX11: encoding: [0x7d,0x04,0x0a,0x3a] + +v_xor_b32 v5, exec_lo, v2 +// GFX11: encoding: [0x7e,0x04,0x0a,0x3a] + +v_xor_b32 v5, exec_hi, v2 +// GFX11: encoding: [0x7f,0x04,0x0a,0x3a] + +v_xor_b32 v5, null, v2 +// GFX11: encoding: [0x7c,0x04,0x0a,0x3a] + +v_xor_b32 v5, -1, v2 +// GFX11: encoding: [0xc1,0x04,0x0a,0x3a] + +v_xor_b32 v5, 0.5, v2 +// GFX11: encoding: [0xf0,0x04,0x0a,0x3a] + +v_xor_b32 v5, src_scc, v2 +// GFX11: encoding: [0xfd,0x04,0x0a,0x3a] + +v_xor_b32 v255, 0xaf123456, v255 +// GFX11: encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s index fb300b2e94972..2a4b3ea201701 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 
%s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo // W32: encoding: [0x01,0x05,0x0a,0x40] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16-fake16.s new file mode 100644 index 0000000000000..6b9092f501e5a --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16-fake16.s @@ -0,0 +1,2114 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -filetype=null 
%s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, 
vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 
v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// 
W64: encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] + +v_add_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] + +v_add_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] + +v_add_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] + +v_add_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] + +v_add_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] + +v_add_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x09,0x13] + +v_add_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xf5,0x30] + +v_add_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] + +v_add_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] + +v_add_f32 v5, v1, v2 row_mirror +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] + +v_add_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] + +v_add_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] + +v_add_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] + +v_add_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x09,0x13] + +v_add_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xf5,0x30] + +v_add_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] + +v_add_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] + +v_add_nc_u32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shr:1 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] + +v_add_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x09,0x13] + +v_add_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x05,0x30] + +v_and_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] + +v_and_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] + +v_and_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] + +v_and_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] + +v_and_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] + +v_and_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] + +v_and_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x09,0x13] + +v_and_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x05,0x30] + +v_ashrrev_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] + +v_ashrrev_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] + +v_ashrrev_i32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] + +v_ashrrev_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x09,0x13] + +v_ashrrev_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 
fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x05,0x30] + +v_cndmask_b32 v5, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] +// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, -v1, |v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, |v1|, -v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, 
v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_share:15 
row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32_dpp v5, -v1, |v2|, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32_dpp v5, |v1|, -v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] + 
+v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x09,0x13] + +v_cvt_pk_rtz_f16_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xf5,0x30] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x09,0x13] + +v_cvt_pkrtz_f16_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xf5,0x30] + +v_dot2acc_f32_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x50,0x01,0xff] + +v_dot2acc_f32_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x5f,0x01,0x01] + +v_dot2acc_f32_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x60,0x09,0x13] + +v_dot2acc_f32_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xf5,0x30] + +v_dot2c_f32_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0xff] + +v_dot2c_f32_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x50,0x01,0xff] + +v_dot2c_f32_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x04,0x01,0x5f,0x01,0x01] + +v_dot2c_f32_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x04,0x01,0x60,0x09,0x13] + +v_dot2c_f32_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xf5,0x30] + +v_fmac_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] + +v_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] + +v_fmac_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] + +v_fmac_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x09,0x13] + +v_fmac_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xf5,0x30] + +v_fmac_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] + +v_fmac_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] + 
+v_fmac_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] + +v_fmac_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x09,0x13] + +v_fmac_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xf5,0x30] + +v_ldexp_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] + +v_ldexp_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] + +v_ldexp_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 
row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] + +v_ldexp_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x09,0x13] + +v_ldexp_f16 v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x35,0x30] + +v_lshlrev_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] + +v_lshlrev_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] + +v_lshlrev_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 
row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] + +v_lshlrev_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x09,0x13] + +v_lshlrev_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x05,0x30] + +v_lshrrev_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] + +v_lshrrev_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] + +v_lshrrev_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] + +v_lshrrev_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x09,0x13] + 
+v_lshrrev_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x05,0x30] + +v_max_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1b,0x00,0xff] + +v_max_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0xe4,0x00,0xff] + +v_max_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x40,0x01,0xff] + +v_max_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x41,0x01,0xff] + +v_max_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x01,0x01,0xff] + +v_max_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x0f,0x01,0xff] + +v_max_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x11,0x01,0xff] + +v_max_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1f,0x01,0xff] + +v_max_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x21,0x01,0xff] + +v_max_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x2f,0x01,0xff] + +v_max_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x50,0x01,0xff] + +v_max_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x5f,0x01,0x01] + +v_max_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x72,0x01,0x60,0x09,0x13] + +v_max_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x72,0x7f,0x6f,0xf5,0x30] + +v_max_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1b,0x00,0xff] + +v_max_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xff] + +v_max_f32 v5, v1, v2 row_mirror +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x20,0x01,0x40,0x01,0xff] + +v_max_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x41,0x01,0xff] + +v_max_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x01,0x01,0xff] + +v_max_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x0f,0x01,0xff] + +v_max_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x11,0x01,0xff] + +v_max_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1f,0x01,0xff] + +v_max_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x21,0x01,0xff] + +v_max_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x2f,0x01,0xff] + +v_max_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x50,0x01,0xff] + +v_max_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x5f,0x01,0x01] + +v_max_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x20,0x01,0x60,0x09,0x13] + +v_max_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x21,0xff,0x6f,0xf5,0x30] + +v_max_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] + +v_max_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] + +v_max_i32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] + +v_max_i32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] + 
+v_max_i32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] + +v_max_i32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] + +v_max_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] + +v_max_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x09,0x13] + +v_max_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x05,0x30] + +v_max_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] + +v_max_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] + +v_max_u32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] + +v_max_u32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] + +v_max_u32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] + +v_max_u32 v5, v1, v2 
row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] + +v_max_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x09,0x13] + +v_max_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x05,0x30] + +v_min_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1b,0x00,0xff] + +v_min_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0xe4,0x00,0xff] + +v_min_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x40,0x01,0xff] + +v_min_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x41,0x01,0xff] + +v_min_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x01,0x01,0xff] + +v_min_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x0f,0x01,0xff] + +v_min_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x11,0x01,0xff] + +v_min_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1f,0x01,0xff] + +v_min_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x21,0x01,0xff] + +v_min_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x2f,0x01,0xff] + +v_min_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x50,0x01,0xff] + +v_min_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x5f,0x01,0x01] + +v_min_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x74,0x01,0x60,0x09,0x13] + +v_min_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x74,0x7f,0x6f,0xf5,0x30] + +v_min_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// 
GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1b,0x00,0xff] + +v_min_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xff] + +v_min_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x40,0x01,0xff] + +v_min_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x41,0x01,0xff] + +v_min_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x01,0x01,0xff] + +v_min_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x0f,0x01,0xff] + +v_min_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x11,0x01,0xff] + +v_min_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1f,0x01,0xff] + +v_min_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x21,0x01,0xff] + +v_min_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x2f,0x01,0xff] + +v_min_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x50,0x01,0xff] + +v_min_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x5f,0x01,0x01] + +v_min_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x60,0x09,0x13] + +v_min_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x1f,0xff,0x6f,0xf5,0x30] + +v_min_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] + +v_min_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] + +v_min_i32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] + +v_min_i32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shl:1 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] + +v_min_i32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] + +v_min_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] + +v_min_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x09,0x13] + +v_min_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x05,0x30] + +v_min_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] + +v_min_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] + +v_min_u32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] + +v_min_u32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] + +v_min_u32 v5, 
v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] + +v_min_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] + +v_min_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x09,0x13] + +v_min_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x05,0x30] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] + 
+v_mul_dx9_zero_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x09,0x13] + +v_mul_dx9_zero_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xf5,0x30] + +v_mul_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] + +v_mul_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] + +v_mul_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] + +v_mul_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x09,0x13] + +v_mul_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xf5,0x30] + +v_mul_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] + +v_mul_f32 v5, v1, 
v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] + +v_mul_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] + +v_mul_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x60,0x09,0x13] + +v_mul_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xf5,0x30] + +v_mul_hi_i32_i24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] + +v_mul_hi_i32_i24 v5, 
v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] + +v_mul_hi_i32_i24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x09,0x13] + +v_mul_hi_i32_i24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x05,0x30] + +v_mul_hi_u32_u24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_ror:1 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] + +v_mul_hi_u32_u24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x09,0x13] + +v_mul_hi_u32_u24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x05,0x30] + +v_mul_i32_i24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] + +v_mul_i32_i24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] + +v_mul_i32_i24 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] + +v_mul_i32_i24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x09,0x13] + +v_mul_i32_i24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x05,0x30] + +v_mul_legacy_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] + +v_mul_legacy_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] + +v_mul_legacy_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] + +v_mul_legacy_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x09,0x13] + +v_mul_legacy_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xf5,0x30] + 
+v_mul_u32_u24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] + +v_mul_u32_u24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] + +v_mul_u32_u24 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] + +v_mul_u32_u24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x09,0x13] + +v_mul_u32_u24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x05,0x30] + +v_or_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] + +v_or_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] + +v_or_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] + +v_or_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] + +v_or_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] + +v_or_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] + +v_or_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x09,0x13] + +v_or_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x05,0x30] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for 
this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 
v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] +// W32-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] + +v_sub_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] + +v_sub_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: 
[0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] + +v_sub_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x09,0x13] + +v_sub_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xf5,0x30] + +v_sub_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] + +v_sub_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] + +v_sub_f32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: 
encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] + +v_sub_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x09,0x13] + +v_sub_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xf5,0x30] + +v_sub_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] + +v_sub_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] + +v_sub_nc_u32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] + +v_sub_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x60,0x09,0x13] + +v_sub_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x05,0x30] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo 
quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] +// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: 
[0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: 
[0xfa,0xfe,0xff,0x45,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] + +v_subrev_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] + +v_subrev_f16 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] + +v_subrev_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x09,0x13] + +v_subrev_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xf5,0x30] + +v_subrev_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] + +v_subrev_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] + +v_subrev_f32 v5, v1, v2 row_mirror +// GFX11: 
encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] + +v_subrev_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x09,0x13] + +v_subrev_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xf5,0x30] + +v_subrev_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] + +v_subrev_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, 
v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] + +v_subrev_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x09,0x13] + +v_subrev_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x05,0x30] + +v_xnor_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] + +v_xnor_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] + +v_xnor_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 
row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] + +v_xnor_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x09,0x13] + +v_xnor_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x05,0x30] + +v_xor_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] + +v_xor_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] + +v_xor_b32 v5, v1, v2 row_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_half_mirror +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shl:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shl:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shr:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shr:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_ror:1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_ror:15 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] + +v_xor_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x09,0x13] + +v_xor_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s index 62c0deaecd96a..3eff00bb96e47 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8-fake16.s new file mode 100644 index 0000000000000..a4fea037a4de7 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8-fake16.s @@ -0,0 +1,451 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: 
[0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +v_add_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] + +v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] + +v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] + +v_add_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] + +v_add_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] + +v_add_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] + +v_add_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] + +v_and_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] + +v_and_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] + +v_and_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] + +v_ashrrev_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] + +v_ashrrev_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] + +v_ashrrev_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] + +v_cndmask_b32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: 
[0xea,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pk_rtz_f16_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pkrtz_f16_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] + +v_dot2acc_f32_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] + +v_dot2acc_f32_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] + 
+v_dot2acc_f32_f16 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x05,0xff,0x00,0x00,0x00] + +v_dot2c_f32_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] + +v_dot2c_f32_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] + +v_dot2c_f32_f16 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x05,0xff,0x00,0x00,0x00] + +v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] + +v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] + +v_fmac_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] + +v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] + +v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] + +v_fmac_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] + +v_ldexp_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] + +v_ldexp_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] + +v_ldexp_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] + +v_lshlrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] + +v_lshlrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] + +v_lshlrev_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] + +v_lshrrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: 
[0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] + +v_lshrrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] + +v_lshrrev_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] + +v_max_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x72,0x01,0x77,0x39,0x05] + +v_max_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x72,0x01,0x77,0x39,0x05] + +v_max_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x72,0x7f,0x00,0x00,0x00] + +v_max_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x20,0x01,0x77,0x39,0x05] + +v_max_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x20,0x01,0x77,0x39,0x05] + +v_max_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x21,0xff,0x00,0x00,0x00] + +v_max_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] + +v_max_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] + +v_max_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] + +v_max_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] + +v_max_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] + +v_max_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] + +v_min_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x74,0x01,0x77,0x39,0x05] + +v_min_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x74,0x01,0x77,0x39,0x05] + +v_min_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: 
[0xe9,0xfe,0xfe,0x74,0x7f,0x00,0x00,0x00] + +v_min_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x1e,0x01,0x77,0x39,0x05] + +v_min_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x1e,0x01,0x77,0x39,0x05] + +v_min_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x1f,0xff,0x00,0x00,0x00] + +v_min_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] + +v_min_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] + +v_min_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] + +v_min_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] + +v_min_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] + +v_min_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] + +v_mul_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] + +v_mul_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] + +v_mul_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] + +v_mul_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] + +v_mul_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: 
[0xea,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] + +v_mul_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] + +v_mul_hi_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] + +v_mul_hi_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] + +v_mul_hi_i32_i24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] + +v_mul_hi_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] + +v_mul_hi_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] + +v_mul_hi_u32_u24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] + +v_mul_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] + +v_mul_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] + +v_mul_i32_i24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] + +v_mul_legacy_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_legacy_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_legacy_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] + +v_mul_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] + +v_mul_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] + +v_mul_u32_u24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] + +v_or_b32 v5, v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] + +v_or_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] + +v_or_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] + +v_sub_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] + +v_sub_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] + 
+v_sub_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] + +v_sub_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] + +v_sub_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] + +v_sub_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] + +v_sub_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] + +v_sub_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: 
error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] + +v_subrev_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] + +v_subrev_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] + +v_subrev_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] + +v_subrev_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] + +v_subrev_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] + +v_subrev_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] + +v_subrev_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] + +v_subrev_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] + +v_xnor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] + +v_xnor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] + +v_xnor_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] + +v_xor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] + +v_xor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] + +v_xor_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s index d235fcdeb526a..0f19cf0028525 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX11,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err-fake16.s new file mode 100644 index 0000000000000..2d52828d1e283 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err-fake16.s @@ -0,0 +1,13 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s + +v_fmaak_f32 v0, 0xff32, v0, 0 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed + +v_fmaak_f16 v0, 
0xff32, v0, 0 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed + +v_fmamk_f32 v0, 0xff32, 1, v0 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed + +v_fmamk_f16 v0, 0xff32, 1, v0 +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err.s index 164a49dcdd47b..dedbcb55d7976 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_err.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s v_fmaak_f32 v0, 0xff32, v0, 0 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: only one unique literal operand is allowed diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s index 76b1c38fad43d..dd619f3077f70 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s @@ -1,237 +1,238 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s -v_add_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmaak_f16_e32 v255, v1, v2, 0xfe0b -// GFX11: 
:[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmamk_f16_e32 v255, v1, 0xfe0b, v3 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_e32 v255.l, v1.l, v2.l -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_add_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_e32 v255, v1, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmaak_f16_e32 v255, v1, v2, 0xfe0b 
+// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmaak_f16_e32 v5, v1, v255, 0xfe0b +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_fmaak_f16_e32 v5, v255, v2, 0xfe0b -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -v_fmac_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmamk_f16_e32 v5, v255, 0xfe0b, v3 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_e32 v5.l, v255.l, v2.l -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v255, v2 
quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_e32 v5, v255, v2 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmaak_f16_e32 v5, v1, v255, 0xfe0b -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmamk_f16_e32 v255, v1, 0xfe0b, v3 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_fmamk_f16_e32 v5, v1, 0xfe0b, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_e32 v5.l, v1.l, v255.l -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_fmamk_f16_e32 v5, v255, 0xfe0b, v3 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v255.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction -v_min_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v255.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction -v_mul_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands 
are not valid for this GPU or mode +v_ldexp_f16_dpp v5.l, v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction -v_sub_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5.l, v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction -v_subrev_f16_e32 v5, v1, v255 -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction -v_add_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction -v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_e32 v255.l, v1.l, v2.l +// GFX11: :[[@LINE-1]]:17: error: invalid operand for instruction -v_ldexp_f16_dpp v255.l, v1.l, v2.l quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_ldexp_f16_e32 v5.l, v1.l, v255.l +// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction -v_max_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_e32 v5.l, v255.l, v2.l +// GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction -v_min_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this 
GPU or mode +v_max_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v5.l, v255.l, v2.l quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_max_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: 
error: operands are not valid for this GPU or mode +v_min_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v5.l, v1.l, v255.l quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_min_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v1, 
v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v255.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_mul_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: 
error: operands are not valid for this GPU or mode -v_mul_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode + +v_mul_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_sub_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v5.l, v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_sub_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// 
GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v5.l, v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction -v_max_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not 
valid for this GPU or mode +v_subrev_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_e32 v255, v1, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_e32 v5, v1, v255 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_subrev_f16_e32 v5, v255, v2 +// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s index a5b5f32e97622..a6dcce40fd0e0 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s @@ -1,201 +1,202 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=_e32 %s v_add_f16 v255, v1, v2 -// GFX11: v_add_f16_e64 +// GFX11: v_add_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x32,0xd5,0x01,0x05,0x02,0x00] -v_fmac_f16 v255, v1, v2 -// GFX11: v_fmac_f16_e64 - -v_ldexp_f16 v255, v1, v2 -// GFX11: v_ldexp_f16_e64 - -v_max_f16 v255, 
v1, v2 -// GFX11: v_max_f16_e64 +v_add_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_min_f16 v255, v1, v2 -// GFX11: v_min_f16_e64 +v_add_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v255, v1, v2 -// GFX11: v_mul_f16_e64 +v_add_f16 v5, v1, v255 +// GFX11: v_add_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x32,0xd5,0x01,0xff,0x03,0x00] -v_sub_f16 v255, v1, v2 -// GFX11: v_sub_f16_e64 +v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_subrev_f16 v255, v1, v2 -// GFX11: v_subrev_f16_e64 +v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_add_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] v_add_f16 v5, v255, v2 -// GFX11: v_add_f16_e64 - -v_fmac_f16 v5, v255, v2 -// GFX11: v_fmac_f16_e64 +// GFX11: v_add_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x32,0xd5,0xff,0x05,0x02,0x00] -v_ldexp_f16 v5, v255, v2 -// GFX11: v_ldexp_f16_e64 +v_add_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_add_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_max_f16 v5, v255, v2 -// GFX11: v_max_f16_e64 +v_add_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_add_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_min_f16 v5, v255, v2 -// GFX11: v_min_f16_e64 +v_fmac_f16 v255, v1, v2 +// GFX11: v_fmac_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00] 
-v_mul_f16 v5, v255, v2 -// GFX11: v_mul_f16_e64 +v_fmac_f16 v5, v1, v255 +// GFX11: v_fmac_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00] -v_sub_f16 v5, v255, v2 -// GFX11: v_sub_f16_e64 +v_fmac_f16 v5, v255, v2 +// GFX11: v_fmac_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00] -v_subrev_f16 v5, v255, v2 -// GFX11: v_subrev_f16_e64 +v_ldexp_f16 v255, v1, v2 +// GFX11: v_ldexp_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x3b,0xd5,0x01,0x05,0x02,0x00] -v_add_f16 v5, v1, v255 -// GFX11: v_add_f16_e64 +v_ldexp_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_fmac_f16 v5, v1, v255 -// GFX11: v_fmac_f16_e64 +v_ldexp_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] v_ldexp_f16 v5, v1, v255 -// GFX11: v_ldexp_f16_e64 +// GFX11: v_ldexp_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x03,0x00] -v_max_f16 v5, v1, v255 -// GFX11: v_max_f16_e64 +v_ldexp_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_min_f16 v5, v1, v255 -// GFX11: v_min_f16_e64 +v_ldexp_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v5, v1, v255 -// GFX11: v_mul_f16_e64 +v_ldexp_f16 v5, v255, v2 +// GFX11: v_ldexp_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x3b,0xd5,0xff,0x05,0x02,0x00] -v_sub_f16 v5, v1, v255 -// GFX11: v_sub_f16_e64 +v_ldexp_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_subrev_f16 v5, v1, v255 -// GFX11: v_subrev_f16_e64 +v_ldexp_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_ldexp_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_add_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_add_f16_e64 +v_max_f16 v255, v1, v2 +// GFX11: v_max_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x39,0xd5,0x01,0x05,0x02,0x00] -v_ldexp_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_max_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_max_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x39,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] v_max_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_max_f16_e64 - -v_min_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_min_f16_e64 +// GFX11: v_max_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x39,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_mul_f16_e64 +v_max_f16 v5, v1, v255 +// GFX11: v_max_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x39,0xd5,0x01,0xff,0x03,0x00] -v_sub_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_sub_f16_e64 +v_max_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_max_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_subrev_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_subrev_f16_e64 +v_max_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_max_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_add_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_add_f16_e64 +v_max_f16 v5, v255, v2 +// GFX11: v_max_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x39,0xd5,0xff,0x05,0x02,0x00] 
-v_ldexp_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_max_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_max_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] v_max_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_max_f16_e64 +// GFX11: v_max_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_min_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_min_f16_e64 +v_min_f16 v255, v1, v2 +// GFX11: v_min_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x3a,0xd5,0x01,0x05,0x02,0x00] -v_mul_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_mul_f16_e64 +v_min_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_min_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x3a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_sub_f16_e64 +v_min_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_min_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x3a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_subrev_f16_e64 +v_min_f16 v5, v1, v255 +// GFX11: v_min_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x3a,0xd5,0x01,0xff,0x03,0x00] -v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_add_f16_e64 +v_min_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_min_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3a,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_ldexp_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_min_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_min_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_max_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: 
v_max_f16_e64 +v_min_f16 v5, v255, v2 +// GFX11: v_min_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x3a,0xd5,0xff,0x05,0x02,0x00] -v_min_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_min_f16_e64 +v_min_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_min_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3a,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_mul_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_mul_f16_e64 +v_min_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_min_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3a,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_sub_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_sub_f16_e64 +v_mul_f16 v255, v1, v2 +// GFX11: v_mul_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x35,0xd5,0x01,0x05,0x02,0x00] -v_subrev_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_subrev_f16_e64 +v_mul_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_add_f16_e64 +v_mul_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_ldexp_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_mul_f16 v5, v1, v255 +// GFX11: v_mul_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x35,0xd5,0x01,0xff,0x03,0x00] -v_max_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_max_f16_e64 +v_mul_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_min_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_min_f16_e64 +v_mul_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v5, v1, v255 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_mul_f16_e64 +v_mul_f16 v5, v255, v2 +// GFX11: v_mul_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x35,0xd5,0xff,0x05,0x02,0x00] -v_sub_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sub_f16_e64 +v_mul_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_subrev_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_subrev_f16_e64 +v_mul_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_mul_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_add_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_add_f16_e64 +v_sub_f16 v255, v1, v2 +// GFX11: v_sub_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x33,0xd5,0x01,0x05,0x02,0x00] -v_ldexp_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_sub_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_max_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_max_f16_e64 +v_sub_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_min_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_min_f16_e64 +v_sub_f16 v5, v1, v255 +// GFX11: v_sub_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x33,0xd5,0x01,0xff,0x03,0x00] -v_mul_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_mul_f16_e64 +v_sub_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x33,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_sub_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sub_f16_e64 +v_sub_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_subrev_f16_e64 +v_sub_f16 v5, v255, v2 +// GFX11: v_sub_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x33,0xd5,0xff,0x05,0x02,0x00] -v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_add_f16_e64 +v_sub_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_ldexp_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_ldexp_f16_e64 +v_sub_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_sub_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_max_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_max_f16_e64 +v_subrev_f16 v255, v1, v2 +// GFX11: v_subrev_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x34,0xd5,0x01,0x05,0x02,0x00] -v_min_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_min_f16_e64 +v_subrev_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_subrev_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_mul_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_mul_f16_e64 +v_subrev_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX11: v_subrev_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_sub_f16_e64 +v_subrev_f16 v5, v1, v255 +// GFX11: v_subrev_f16_e64 v5, v1, v255 ; encoding: 
[0x05,0x00,0x34,0xd5,0x01,0xff,0x03,0x00] v_subrev_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_subrev_f16_e64 +// GFX11: v_subrev_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_subrev_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX11: v_subrev_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_subrev_f16 v5, v255, v2 +// GFX11: v_subrev_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x34,0xd5,0xff,0x05,0x02,0x00] + +v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_subrev_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX11: v_subrev_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2-fake16.s new file mode 100644 index 0000000000000..4c37502e1b247 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2-fake16.s @@ -0,0 +1,2560 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x40] +// 
W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: 
error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x40] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + 
+v_add_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x40] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x64] + +v_add_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x64] + +v_add_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x64] + +v_add_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x64] + +v_add_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x64] + +v_add_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x64] + +v_add_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x64] + +v_add_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x64] + +v_add_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x64] + +v_add_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x64] + +v_add_f16 v5, null, v2 +// GFX12: encoding: 
[0x7c,0x04,0x0a,0x64] + +v_add_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x64] + +v_add_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x64] + +v_add_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x64] + +v_add_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] + +v_add_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x06] + +v_add_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x06] + +v_add_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x06] + +v_add_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x06] + +v_add_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x06] + +v_add_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x06] + +v_add_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x06] + +v_add_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x06] + +v_add_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x06] + +v_add_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x06] + +v_add_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x06] + +v_add_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x06] + +v_add_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x06] + +v_add_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x06] + +v_add_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] + +v_add_f64 v[5:6], v[1:2], v[3:4] +// GFX12: encoding: [0x01,0x07,0x0a,0x04] + +v_add_f64 v[5:6], v[254:255], v[2:3] +// GFX12: encoding: [0xfe,0x05,0x0a,0x04] + +v_add_f64 v[5:6], s[0:1], v[2:3] +// GFX12: encoding: [0x00,0x04,0x0a,0x04] + +v_add_f64 v[5:6], s[104:105], v[2:3] +// GFX12: encoding: [0x68,0x04,0x0a,0x04] + +v_add_f64 v[5:6], vcc, v[2:3] +// GFX12: encoding: [0x6a,0x04,0x0a,0x04] + +v_add_f64 v[5:6], ttmp[14:15], v[2:3] +// GFX12: encoding: [0x7a,0x04,0x0a,0x04] + +v_add_f64 v[5:6], exec, v[2:3] +// GFX12: encoding: [0x7e,0x04,0x0a,0x04] + +v_add_f64 v[5:6], null, v[2:3] +// 
GFX12: encoding: [0x7c,0x04,0x0a,0x04] + +v_add_f64 v[5:6], -1, v[2:3] +// GFX12: encoding: [0xc1,0x04,0x0a,0x04] + +v_add_f64 v[5:6], 0.5, v[2:3] +// GFX12: encoding: [0xf0,0x04,0x0a,0x04] + +v_add_f64 v[5:6], src_scc, v[2:3] +// GFX12: encoding: [0xfd,0x04,0x0a,0x04] + +v_add_f64 v[254:255], 0xaf123456, v[254:255] +// GFX12: encoding: [0xff,0xfc,0xfd,0x05,0x56,0x34,0x12,0xaf] + +v_add_nc_u32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x4a] + +v_add_nc_u32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x4a] + +v_add_nc_u32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x4a] + +v_add_nc_u32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x4a] + +v_add_nc_u32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] + +v_and_b32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x36] + +v_and_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x36] + +v_and_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x36] + +v_and_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x36] + +v_and_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x36] + +v_and_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x36] + +v_and_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x36] + +v_and_b32 v5, 
m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x36] + +v_and_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x36] + +v_and_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x36] + +v_and_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x36] + +v_and_b32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x36] + +v_and_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x36] + +v_and_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x36] + +v_and_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] + +v_ashrrev_i32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x34] + +v_ashrrev_i32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x34] + +v_ashrrev_i32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x34] + +v_ashrrev_i32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x34] + +v_ashrrev_i32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] + +v_cndmask_b32 v5, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands 
are not valid for this GPU or mode + +v_cndmask_b32 v5, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x02] +// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, null, v2, vcc +// W64: encoding: 
[0x7c,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x5e] + +v_cvt_pk_rtz_f16_f32 v255, 0xaf123456, v255 +// GFX12: encoding: 
[0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x5e] + +v_cvt_pkrtz_f16_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] + +v_fmaak_f16 v5, v1, v2, 0xfe0b +// GFX12: encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, v127, v2, 0xfe0b +// GFX12: encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, s1, v2, 0xfe0b +// GFX12: encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, s105, v2, 0xfe0b +// GFX12: encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b +// GFX12: encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b +// GFX12: encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, ttmp15, v2, 0xfe0b +// GFX12: encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, m0, v2, 0xfe0b +// 
GFX12: encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, exec_lo, v2, 0xfe0b +// GFX12: encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, exec_hi, v2, 0xfe0b +// GFX12: encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, null, v2, 0xfe0b +// GFX12: encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, -1, v2, 0xfe0b +// GFX12: encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, 0.5, v2, 0xfe0b +// GFX12: encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v5, src_scc, v2, 0xfe0b +// GFX12: encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b +// GFX12: encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] + +v_fmaak_f32 v5, v1, v2, 0xaf123456 +// GFX12: encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, v255, v2, 0xaf123456 +// GFX12: encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, s1, v2, 0xaf123456 +// GFX12: encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, s105, v2, 0xaf123456 +// GFX12: encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 +// GFX12: encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 +// GFX12: encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 +// GFX12: encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, m0, v2, 0xaf123456 +// GFX12: encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 +// GFX12: encoding: [0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 +// GFX12: encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, null, v2, 0xaf123456 +// GFX12: encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, -1, v2, 0xaf123456 +// GFX12: encoding: 
[0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, 0.5, v2, 0xaf123456 +// GFX12: encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v5, src_scc, v2, 0xaf123456 +// GFX12: encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] + +v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 +// GFX12: encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] + +v_fmac_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x6c] + +v_fmac_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x6c] + +v_fmac_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x6c] + +v_fmac_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x6c] + +v_fmac_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x6c] + +v_fmac_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x6c] + +v_fmac_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x6c] + +v_fmac_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x6c] + +v_fmac_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x6c] + +v_fmac_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x6c] + +v_fmac_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x6c] + +v_fmac_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x6c] + +v_fmac_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x6c] + +v_fmac_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x6c] + +v_fmac_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] + +v_fmac_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x56] + +v_fmac_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x56] + +v_fmac_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x56] + +v_fmac_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x56] + +v_fmac_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x56] + +v_fmac_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x56] + +v_fmac_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x56] + +v_fmac_f32 v5, m0, v2 +// GFX12: encoding: 
[0x7d,0x04,0x0a,0x56] + +v_fmac_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x56] + +v_fmac_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x56] + +v_fmac_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x56] + +v_fmac_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x56] + +v_fmac_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x56] + +v_fmac_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x56] + +v_fmac_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] + +v_fmamk_f16 v5, v1, 0xfe0b, v3 +// GFX12: encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, v127, 0xfe0b, v3 +// GFX12: encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, s1, 0xfe0b, v3 +// GFX12: encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, s105, 0xfe0b, v3 +// GFX12: encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 +// GFX12: encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 +// GFX12: encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 +// GFX12: encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, m0, 0xfe0b, v3 +// GFX12: encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 +// GFX12: encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 +// GFX12: encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, null, 0xfe0b, v3 +// GFX12: encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, -1, 0xfe0b, v3 +// GFX12: encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, 0.5, 0xfe0b, v3 +// GFX12: encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v5, src_scc, 0xfe0b, v3 +// GFX12: encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 +// GFX12: 
encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] + +v_fmamk_f32 v5, v1, 0xaf123456, v3 +// GFX12: encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, v255, 0xaf123456, v3 +// GFX12: encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, s1, 0xaf123456, v3 +// GFX12: encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, s105, 0xaf123456, v3 +// GFX12: encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 +// GFX12: encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 +// GFX12: encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 +// GFX12: encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, m0, 0xaf123456, v3 +// GFX12: encoding: [0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 +// GFX12: encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 +// GFX12: encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, null, 0xaf123456, v3 +// GFX12: encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, -1, 0xaf123456, v3 +// GFX12: encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, 0.5, 0xaf123456, v3 +// GFX12: encoding: [0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v5, src_scc, 0xaf123456, v3 +// GFX12: encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] + +v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] + +v_ldexp_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x76] + +v_ldexp_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x76] + +v_ldexp_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x76] + +v_ldexp_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x76] + +v_ldexp_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x76] + +v_ldexp_f16 v5, vcc_hi, v2 +// 
GFX12: encoding: [0x6b,0x04,0x0a,0x76] + +v_ldexp_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x76] + +v_ldexp_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x76] + +v_ldexp_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x76] + +v_ldexp_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x76] + +v_ldexp_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x76] + +v_ldexp_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x76] + +v_ldexp_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x76] + +v_ldexp_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x76] + +v_ldexp_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] + +v_lshlrev_b32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x30] + +v_lshlrev_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x30] + +v_lshlrev_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x30] + +v_lshlrev_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x30] + +v_lshlrev_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] + +v_lshlrev_b64 v[5:6], v1, v[3:4] +// GFX12: encoding: [0x01,0x07,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], v255, v[2:3] +// GFX12: encoding: 
[0xff,0x05,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], s1, v[2:3] +// GFX12: encoding: [0x01,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], s105, v[2:3] +// GFX12: encoding: [0x69,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], vcc_lo, v[2:3] +// GFX12: encoding: [0x6a,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], vcc_hi, v[2:3] +// GFX12: encoding: [0x6b,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], ttmp15, v[2:3] +// GFX12: encoding: [0x7b,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], exec_lo, v[2:3] +// GFX12: encoding: [0x7e,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], exec_hi, v[2:3] +// GFX12: encoding: [0x7f,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], null, v[2:3] +// GFX12: encoding: [0x7c,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], -1, v[2:3] +// GFX12: encoding: [0xc1,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], 0.5, v[2:3] +// GFX12: encoding: [0xf0,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[5:6], src_scc, v[2:3] +// GFX12: encoding: [0xfd,0x04,0x0a,0x3e] + +v_lshlrev_b64 v[254:255], 0xaf123456, v[254:255] +// GFX12: encoding: [0xff,0xfc,0xfd,0x3f,0x56,0x34,0x12,0xaf] + +v_lshrrev_b32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x32] + +v_lshrrev_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x32] + +v_lshrrev_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x32] + +v_lshrrev_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x32] + 
+v_lshrrev_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x32] + +v_lshrrev_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] + +v_max_num_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x62] + +v_max_num_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x62] + +v_max_num_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x62] + +v_max_num_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x62] + +v_max_num_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x62] + +v_max_num_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x62] + +v_max_num_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x62] + +v_max_num_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x62] + +v_max_num_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x62] + +v_max_num_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x62] + +v_max_num_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x62] + +v_max_num_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x62] + +v_max_num_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x62] + +v_max_num_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x62] + +v_max_num_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x62,0x0b,0xfe,0x00,0x00] + +v_max_num_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x2c] + +v_max_num_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x2c] + +v_max_num_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x2c] + +v_max_num_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x2c] + +v_max_num_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x2c] + +v_max_num_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x2c] + +v_max_num_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x2c] + +v_max_num_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x2c] + +v_max_num_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x2c] + +v_max_num_f32 v5, exec_hi, v2 
+// GFX12: encoding: [0x7f,0x04,0x0a,0x2c] + +v_max_num_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x2c] + +v_max_num_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x2c] + +v_max_num_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x2c] + +v_max_num_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x2c] + +v_max_num_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x2d,0x56,0x34,0x12,0xaf] + +v_max_num_f64 v[5:6], v[1:2], v[3:4] +// GFX12: encoding: [0x01,0x07,0x0a,0x1c] + +v_max_num_f64 v[5:6], v[254:255], v[2:3] +// GFX12: encoding: [0xfe,0x05,0x0a,0x1c] + +v_max_num_f64 v[5:6], s[0:1], v[2:3] +// GFX12: encoding: [0x00,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], s[104:105], v[2:3] +// GFX12: encoding: [0x68,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], vcc, v[2:3] +// GFX12: encoding: [0x6a,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], ttmp[14:15], v[2:3] +// GFX12: encoding: [0x7a,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], exec, v[2:3] +// GFX12: encoding: [0x7e,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], null, v[2:3] +// GFX12: encoding: [0x7c,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], -1, v[2:3] +// GFX12: encoding: [0xc1,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], 0.5, v[2:3] +// GFX12: encoding: [0xf0,0x04,0x0a,0x1c] + +v_max_num_f64 v[5:6], src_scc, v[2:3] +// GFX12: encoding: [0xfd,0x04,0x0a,0x1c] + +v_max_num_f64 v[254:255], 0xaf123456, v[254:255] +// GFX12: encoding: [0xff,0xfc,0xfd,0x1d,0x56,0x34,0x12,0xaf] + +v_max_i32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x24] + +v_max_i32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x24] + +v_max_i32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x24] + +v_max_i32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x24] + +v_max_i32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x24] + +v_max_i32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x24] + +v_max_i32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x24] + +v_max_i32 v5, m0, v2 +// GFX12: encoding: 
[0x7d,0x04,0x0a,0x24] + +v_max_i32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x24] + +v_max_i32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x24] + +v_max_i32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x24] + +v_max_i32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x24] + +v_max_i32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x24] + +v_max_i32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x24] + +v_max_i32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] + +v_max_u32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x28] + +v_max_u32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x28] + +v_max_u32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x28] + +v_max_u32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x28] + +v_max_u32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x28] + +v_max_u32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x28] + +v_max_u32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x28] + +v_max_u32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x28] + +v_max_u32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x28] + +v_max_u32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x28] + +v_max_u32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x28] + +v_max_u32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x28] + +v_max_u32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x28] + +v_max_u32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x28] + +v_max_u32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] + +v_min_num_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x60] + +v_min_num_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x60] + +v_min_num_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x60] + +v_min_num_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x60] + +v_min_num_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x60] + +v_min_num_f16 v5, 
vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x60] + +v_min_num_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x60] + +v_min_num_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x60] + +v_min_num_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x60] + +v_min_num_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x60] + +v_min_num_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x60] + +v_min_num_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x60] + +v_min_num_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x60] + +v_min_num_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x60] + +v_min_num_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x60,0x0b,0xfe,0x00,0x00] + +v_min_num_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x2a] + +v_min_num_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x2a] + +v_min_num_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x2a] + +v_min_num_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x2a] + +v_min_num_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x2a] + +v_min_num_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x2a] + +v_min_num_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x2a] + +v_min_num_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x2a] + +v_min_num_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x2a] + +v_min_num_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x2a] + +v_min_num_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x2a] + +v_min_num_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x2a] + +v_min_num_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x2a] + +v_min_num_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x2a] + +v_min_num_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x2b,0x56,0x34,0x12,0xaf] + +v_min_num_f64 v[5:6], v[1:2], v[3:4] +// GFX12: encoding: [0x01,0x07,0x0a,0x1a] + +v_min_num_f64 v[5:6], v[254:255], 
v[2:3] +// GFX12: encoding: [0xfe,0x05,0x0a,0x1a] + +v_min_num_f64 v[5:6], s[0:1], v[2:3] +// GFX12: encoding: [0x00,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], s[104:105], v[2:3] +// GFX12: encoding: [0x68,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], vcc, v[2:3] +// GFX12: encoding: [0x6a,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], ttmp[14:15], v[2:3] +// GFX12: encoding: [0x7a,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], exec, v[2:3] +// GFX12: encoding: [0x7e,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], null, v[2:3] +// GFX12: encoding: [0x7c,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], -1, v[2:3] +// GFX12: encoding: [0xc1,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], 0.5, v[2:3] +// GFX12: encoding: [0xf0,0x04,0x0a,0x1a] + +v_min_num_f64 v[5:6], src_scc, v[2:3] +// GFX12: encoding: [0xfd,0x04,0x0a,0x1a] + +v_min_num_f64 v[254:255], 0xaf123456, v[254:255] +// GFX12: encoding: [0xff,0xfc,0xfd,0x1b,0x56,0x34,0x12,0xaf] + +v_min_i32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x22] + +v_min_i32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x22] + +v_min_i32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x22] + +v_min_i32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x22] + +v_min_i32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x22] + +v_min_i32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x22] + +v_min_i32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x22] + +v_min_i32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x22] + +v_min_i32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x22] + +v_min_i32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x22] + +v_min_i32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x22] + +v_min_i32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x22] + +v_min_i32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x22] + +v_min_i32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x22] + +v_min_i32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] + +v_min_u32 v5, v1, 
v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x26] + +v_min_u32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x26] + +v_min_u32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x26] + +v_min_u32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x26] + +v_min_u32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x26] + +v_min_u32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x26] + +v_min_u32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x26] + +v_min_u32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x26] + +v_min_u32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x26] + +v_min_u32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x26] + +v_min_u32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x26] + +v_min_u32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x26] + +v_min_u32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x26] + +v_min_u32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x26] + +v_min_u32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] + +v_mul_dx9_zero_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x0e] + 
+v_mul_dx9_zero_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x0e] + +v_mul_dx9_zero_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] + +v_mul_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x6a] + +v_mul_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x6a] + +v_mul_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x6a] + +v_mul_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x6a] + +v_mul_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x6a] + +v_mul_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x6a] + +v_mul_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x6a] + +v_mul_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x6a] + +v_mul_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x6a] + +v_mul_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x6a] + +v_mul_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x6a] + +v_mul_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x6a] + +v_mul_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x6a] + +v_mul_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x6a] + +v_mul_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] + +v_mul_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x10] + +v_mul_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x10] + +v_mul_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x10] + +v_mul_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x10] + +v_mul_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x10] + +v_mul_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x10] + +v_mul_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x10] + +v_mul_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x10] + +v_mul_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x10] + +v_mul_f32 v5, exec_hi, v2 +// GFX12: 
encoding: [0x7f,0x04,0x0a,0x10] + +v_mul_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x10] + +v_mul_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x10] + +v_mul_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x10] + +v_mul_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x10] + +v_mul_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] + +v_mul_f64 v[5:6], v[1:2], v[3:4] +// GFX12: encoding: [0x01,0x07,0x0a,0x0c] + +v_mul_f64 v[5:6], v[254:255], v[2:3] +// GFX12: encoding: [0xfe,0x05,0x0a,0x0c] + +v_mul_f64 v[5:6], s[0:1], v[2:3] +// GFX12: encoding: [0x00,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], s[104:105], v[2:3] +// GFX12: encoding: [0x68,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], vcc, v[2:3] +// GFX12: encoding: [0x6a,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], ttmp[14:15], v[2:3] +// GFX12: encoding: [0x7a,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], exec, v[2:3] +// GFX12: encoding: [0x7e,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], null, v[2:3] +// GFX12: encoding: [0x7c,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], -1, v[2:3] +// GFX12: encoding: [0xc1,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], 0.5, v[2:3] +// GFX12: encoding: [0xf0,0x04,0x0a,0x0c] + +v_mul_f64 v[5:6], src_scc, v[2:3] +// GFX12: encoding: [0xfd,0x04,0x0a,0x0c] + +v_mul_f64 v[254:255], 0xaf123456, v[254:255] +// GFX12: encoding: [0xff,0xfc,0xfd,0x0d,0x56,0x34,0x12,0xaf] + +v_mul_hi_i32_i24 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x14] + +v_mul_hi_i32_i24 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x14] + +v_mul_hi_i32_i24 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x14] + 
+v_mul_hi_i32_i24 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x14] + +v_mul_hi_i32_i24 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] + +v_mul_hi_u32_u24 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x18] + +v_mul_hi_u32_u24 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x18] + +v_mul_hi_u32_u24 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x18] + +v_mul_hi_u32_u24 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] + +v_mul_i32_i24 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x12] + +v_mul_i32_i24 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x12] + +v_mul_i32_i24 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, s105, v2 +// GFX12: 
encoding: [0x69,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x12] + +v_mul_i32_i24 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x12] + +v_mul_i32_i24 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] + +v_mul_legacy_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x0e] + +v_mul_legacy_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x0e] + +v_mul_legacy_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x0e] + +v_mul_legacy_f32 v255, 0xaf123456, v255 +// GFX12: 
encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] + +v_mul_u32_u24 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x16] + +v_mul_u32_u24 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x16] + +v_mul_u32_u24 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x16] + +v_mul_u32_u24 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x16] + +v_mul_u32_u24 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] + +v_or_b32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x38] + +v_or_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x38] + +v_or_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x38] + +v_or_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x38] + +v_or_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x38] + +v_or_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x38] + +v_or_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x38] + +v_or_b32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x38] + +v_or_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x38] + +v_or_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x38] + +v_or_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x38] + +v_or_b32 v5, -1, v2 +// GFX12: encoding: 
[0xc1,0x04,0x0a,0x38] + +v_or_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x38] + +v_or_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x38] + +v_or_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] + +v_pk_fmac_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x78] + +v_pk_fmac_f16 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x78] + +v_pk_fmac_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x78] + +v_pk_fmac_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x78] + +v_pk_fmac_f16 v255, 0xfe0b, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: 
[0x69,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x42] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: 
[0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are 
not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x42] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x66] + +v_sub_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x66] + +v_sub_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x66] + +v_sub_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x66] + +v_sub_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x66] + +v_sub_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x66] + +v_sub_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x66] + +v_sub_f16 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x66] + +v_sub_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x66] + +v_sub_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x66] + +v_sub_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x66] + +v_sub_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x66] + +v_sub_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x66] + +v_sub_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x66] + +v_sub_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] + +v_sub_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x08] + +v_sub_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x08] + +v_sub_f32 v5, s1, v2 +// GFX12: encoding: 
[0x01,0x04,0x0a,0x08] + +v_sub_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x08] + +v_sub_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x08] + +v_sub_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x08] + +v_sub_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x08] + +v_sub_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x08] + +v_sub_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x08] + +v_sub_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x08] + +v_sub_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x08] + +v_sub_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x08] + +v_sub_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x08] + +v_sub_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x08] + +v_sub_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] + +v_sub_nc_u32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x4c] + +v_sub_nc_u32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x4c] + +v_sub_nc_u32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x4c] + +v_sub_nc_u32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x4c] + +v_sub_nc_u32 v255, 0xaf123456, v255 +// GFX12: encoding: 
[0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x01,0x05,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v255, v2, vcc_lo +// W32: encoding: [0xff,0x05,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, s1, v2, vcc_lo +// W32: encoding: [0x01,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, s105, v2, vcc_lo +// W32: encoding: [0x69,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, vcc_lo, v2, vcc_lo +// W32: encoding: [0x6a,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, vcc_hi, v2, vcc_lo +// W32: encoding: [0x6b,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, ttmp15, v2, vcc_lo +// W32: encoding: [0x7b,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, m0, v2, vcc_lo +// W32: encoding: [0x7d,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, exec_lo, v2, vcc_lo +// W32: encoding: [0x7e,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, exec_hi, v2, vcc_lo +// W32: encoding: [0x7f,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, null, v2, vcc_lo +// W32: encoding: [0x7c,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, -1, v2, vcc_lo +// W32: encoding: [0xc1,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, 0.5, v2, vcc_lo +// W32: encoding: [0xf0,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, src_scc, v2, vcc_lo +// W32: encoding: [0xfd,0x04,0x0a,0x44] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, 0xaf123456, v255, vcc_lo +// W32: encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc +// W64: encoding: [0x01,0x05,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v255, v2, vcc +// W64: encoding: [0xff,0x05,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, s1, v2, vcc +// W64: encoding: [0x01,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, s105, v2, vcc +// W64: encoding: [0x69,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, vcc_lo, v2, vcc +// W64: encoding: [0x6a,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, vcc_hi, v2, vcc +// W64: encoding: [0x6b,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, ttmp15, v2, vcc +// W64: encoding: [0x7b,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not 
valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, m0, v2, vcc +// W64: encoding: [0x7d,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, exec_lo, v2, vcc +// W64: encoding: [0x7e,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, exec_hi, v2, vcc +// W64: encoding: [0x7f,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, null, v2, vcc +// W64: encoding: [0x7c,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, -1, v2, vcc +// W64: encoding: [0xc1,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, 0.5, v2, vcc +// W64: encoding: [0xf0,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, src_scc, v2, vcc +// W64: encoding: [0xfd,0x04,0x0a,0x44] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, 0xaf123456, v255, vcc +// W64: encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x68] + +v_subrev_f16 v5, v127, v2 +// GFX12: encoding: [0x7f,0x05,0x0a,0x68] + +v_subrev_f16 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x68] + +v_subrev_f16 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x68] + +v_subrev_f16 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x68] + +v_subrev_f16 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x68] + +v_subrev_f16 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x68] + +v_subrev_f16 v5, m0, v2 +// 
GFX12: encoding: [0x7d,0x04,0x0a,0x68] + +v_subrev_f16 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x68] + +v_subrev_f16 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x68] + +v_subrev_f16 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x68] + +v_subrev_f16 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x68] + +v_subrev_f16 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x68] + +v_subrev_f16 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x68] + +v_subrev_f16 v127, 0xfe0b, v127 +// GFX12: encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] + +v_subrev_f32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x0a] + +v_subrev_f32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x0a] + +v_subrev_f32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x0a] + +v_subrev_f32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x0a] + +v_subrev_f32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x0a] + +v_subrev_f32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x0a] + +v_subrev_f32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x0a] + +v_subrev_f32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x0a] + +v_subrev_f32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x0a] + +v_subrev_f32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x0a] + +v_subrev_f32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x0a] + +v_subrev_f32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x0a] + +v_subrev_f32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x0a] + +v_subrev_f32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x0a] + +v_subrev_f32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] + +v_subrev_nc_u32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x4e] + +v_subrev_nc_u32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x4e] + +v_subrev_nc_u32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x4e] + 
+v_subrev_nc_u32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x4e] + +v_subrev_nc_u32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] + +v_xnor_b32 v5, v1, v2 +// GFX12: encoding: [0x01,0x05,0x0a,0x3c] + +v_xnor_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x3c] + +v_xnor_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x3c] + +v_xnor_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x3c] + +v_xnor_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x3c] + +v_xnor_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x3c] + +v_xnor_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x3c] + +v_xnor_b32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x3c] + +v_xnor_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x3c] + +v_xnor_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x3c] + +v_xnor_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x3c] + +v_xnor_b32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x3c] + +v_xnor_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x3c] + +v_xnor_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x3c] + +v_xnor_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] + +v_xor_b32 v5, v1, v2 +// GFX12: encoding: 
[0x01,0x05,0x0a,0x3a] + +v_xor_b32 v5, v255, v2 +// GFX12: encoding: [0xff,0x05,0x0a,0x3a] + +v_xor_b32 v5, s1, v2 +// GFX12: encoding: [0x01,0x04,0x0a,0x3a] + +v_xor_b32 v5, s105, v2 +// GFX12: encoding: [0x69,0x04,0x0a,0x3a] + +v_xor_b32 v5, vcc_lo, v2 +// GFX12: encoding: [0x6a,0x04,0x0a,0x3a] + +v_xor_b32 v5, vcc_hi, v2 +// GFX12: encoding: [0x6b,0x04,0x0a,0x3a] + +v_xor_b32 v5, ttmp15, v2 +// GFX12: encoding: [0x7b,0x04,0x0a,0x3a] + +v_xor_b32 v5, m0, v2 +// GFX12: encoding: [0x7d,0x04,0x0a,0x3a] + +v_xor_b32 v5, exec_lo, v2 +// GFX12: encoding: [0x7e,0x04,0x0a,0x3a] + +v_xor_b32 v5, exec_hi, v2 +// GFX12: encoding: [0x7f,0x04,0x0a,0x3a] + +v_xor_b32 v5, null, v2 +// GFX12: encoding: [0x7c,0x04,0x0a,0x3a] + +v_xor_b32 v5, -1, v2 +// GFX12: encoding: [0xc1,0x04,0x0a,0x3a] + +v_xor_b32 v5, 0.5, v2 +// GFX12: encoding: [0xf0,0x04,0x0a,0x3a] + +v_xor_b32 v5, src_scc, v2 +// GFX12: encoding: [0xfd,0x04,0x0a,0x3a] + +v_xor_b32 v255, 0xaf123456, v255 +// GFX12: encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s index 08d4be0881319..5593ea77d9424 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc 
-triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo // W32: encoding: [0x01,0x05,0x0a,0x40] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases-fake16.s new file mode 100644 index 0000000000000..ebab0859b3484 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases-fake16.s @@ -0,0 +1,19 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s + +v_min_f32 v5, v1, v2 +// GFX12: v_min_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2a] + +v_max_f32 v5, v1, v2 +// GFX12: v_max_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2c] + +v_min_f16 v5, v1, v2 +// GFX12: v_min_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x60] + +v_max_f16 v5, v1, v2 +// GFX12: v_max_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x62] + +v_max_f64 v[5:6], v[1:2], v[2:3] +// GFX12: v_max_num_f64_e32 v[5:6], v[1:2], v[2:3] ; encoding: [0x01,0x05,0x0a,0x1c] + +v_min_f64 v[5:6], v[1:2], v[2:3] +// GFX12: v_min_num_f64_e32 v[5:6], v[1:2], v[2:3] ; encoding: [0x01,0x05,0x0a,0x1a] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s index 3918dd48cfc06..b7e51cf270647 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_aliases.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s +// 
RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_min_f32 v5, v1, v2 // GFX12: v_min_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2a] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16-fake16.s new file mode 100644 index 0000000000000..53373d1f46973 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16-fake16.s @@ -0,0 +1,2006 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + 
+v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, v255, 
v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: 
error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] + +v_add_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] + +v_add_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] + +v_add_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] + +v_add_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] + +v_add_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] + +v_add_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] + +v_add_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] + +v_add_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x09,0x13] + +v_add_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xf5,0x30] + +v_add_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] + +v_add_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] + +v_add_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] + +v_add_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] + +v_add_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] + +v_add_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] + +v_add_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] + +v_add_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: 
encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] + +v_add_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x09,0x13] + +v_add_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xf5,0x30] + +v_add_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] + +v_add_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] + +v_add_nc_u32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] + +v_add_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] + +v_add_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x09,0x13] + +v_add_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x05,0x30] + +v_and_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: 
encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] + +v_and_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] + +v_and_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] + +v_and_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] + +v_and_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] + +v_and_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] + +v_and_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] + +v_and_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] + +v_and_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x09,0x13] + +v_and_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x05,0x30] + +v_ashrrev_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] + +v_ashrrev_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] + +v_ashrrev_i32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shl:1 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] + +v_ashrrev_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] + +v_ashrrev_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x09,0x13] + +v_ashrrev_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x05,0x30] + +v_cndmask_b32 v5, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shl:1 +// W32: encoding: 
[0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x05,0x30] +// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] + 
+v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x09,0x13] + +v_cvt_pk_rtz_f16_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xf5,0x30] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_share:15 
row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x09,0x13] + +v_cvt_pkrtz_f16_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xf5,0x30] + +v_fmac_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] + +v_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] + +v_fmac_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] + +v_fmac_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] + +v_fmac_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x09,0x13] + +v_fmac_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xf5,0x30] + +v_fmac_f32 v5, 
v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] + +v_fmac_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] + +v_fmac_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] + +v_fmac_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] + +v_fmac_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x09,0x13] + +v_fmac_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xf5,0x30] + +v_ldexp_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] + +v_ldexp_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] + +v_ldexp_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 
row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] + +v_ldexp_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] + +v_ldexp_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x09,0x13] + +v_ldexp_f16 v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x35,0x30] + +v_lshlrev_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] + +v_lshlrev_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] + +v_lshlrev_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 
row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] + +v_lshlrev_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] + +v_lshlrev_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x09,0x13] + +v_lshlrev_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x05,0x30] + +v_lshrrev_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] + +v_lshrrev_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] + +v_lshrrev_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] + +v_lshrrev_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] + +v_lshrrev_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x09,0x13] + +v_lshrrev_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x05,0x30] + +v_max_num_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0xff] + +v_max_num_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xff] + +v_max_num_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x50,0x01,0xff] + +v_max_num_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x5f,0x01,0x01] + +v_max_num_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x62,0x01,0x60,0x09,0x13] + +v_max_num_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x62,0x7f,0x6f,0xf5,0x30] + +v_max_num_f32 v5, v1, v2 
quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1b,0x00,0xff] + +v_max_num_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xff] + +v_max_num_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x40,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x41,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x01,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x0f,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x11,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1f,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x21,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x2f,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x50,0x01,0xff] + +v_max_num_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x5f,0x01,0x01] + +v_max_num_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x60,0x09,0x13] + +v_max_num_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x2d,0xff,0x6f,0xf5,0x30] + +v_max_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] + +v_max_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] + +v_max_i32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] + +v_max_i32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] + 
+v_max_i32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] + +v_max_i32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] + +v_max_i32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] + +v_max_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] + +v_max_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] + +v_max_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x09,0x13] + +v_max_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x05,0x30] + +v_max_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] + +v_max_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] + +v_max_u32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] + +v_max_u32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] + +v_max_u32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_ror:1 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] + +v_max_u32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] + +v_max_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] + +v_max_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] + +v_max_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x09,0x13] + +v_max_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x05,0x30] + +v_min_num_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0xff] + +v_min_num_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xff] + +v_min_num_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x50,0x01,0xff] + +v_min_num_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x5f,0x01,0x01] + 
+v_min_num_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x60,0x01,0x60,0x09,0x13] + +v_min_num_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x60,0x7f,0x6f,0xf5,0x30] + +v_min_num_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1b,0x00,0xff] + +v_min_num_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xff] + +v_min_num_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x40,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x41,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x01,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x0f,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x11,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1f,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x21,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x2f,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x50,0x01,0xff] + +v_min_num_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x5f,0x01,0x01] + +v_min_num_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x60,0x09,0x13] + +v_min_num_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x2b,0xff,0x6f,0xf5,0x30] + +v_min_i32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] + +v_min_i32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] + +v_min_i32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] + +v_min_i32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] + +v_min_i32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] + +v_min_i32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] + +v_min_i32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] + +v_min_i32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] + +v_min_i32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x09,0x13] + +v_min_i32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x05,0x30] + +v_min_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] + +v_min_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] + +v_min_u32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] + +v_min_u32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] + 
+v_min_u32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] + +v_min_u32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] + +v_min_u32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] + +v_min_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] + +v_min_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] + +v_min_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x09,0x13] + +v_min_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x05,0x30] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] + +v_mul_dx9_zero_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x09,0x13] + +v_mul_dx9_zero_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xf5,0x30] + +v_mul_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] + +v_mul_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] + +v_mul_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] + +v_mul_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] + 
+v_mul_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x09,0x13] + +v_mul_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xf5,0x30] + +v_mul_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] + +v_mul_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] + +v_mul_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] + +v_mul_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] + +v_mul_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x10,0x01,0x60,0x09,0x13] + +v_mul_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xf5,0x30] + +v_mul_hi_i32_i24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 
quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] + +v_mul_hi_i32_i24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] + +v_mul_hi_i32_i24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x09,0x13] + +v_mul_hi_i32_i24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x05,0x30] + +v_mul_hi_u32_u24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shl:1 +// GFX12: 
encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] + +v_mul_hi_u32_u24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] + +v_mul_hi_u32_u24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x09,0x13] + +v_mul_hi_u32_u24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x05,0x30] + +v_mul_i32_i24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] + +v_mul_i32_i24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] + +v_mul_i32_i24 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] + 
+v_mul_i32_i24 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] + +v_mul_i32_i24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] + +v_mul_i32_i24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x09,0x13] + +v_mul_i32_i24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x05,0x30] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] + +v_mul_dx9_zero_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] + 
+v_mul_dx9_zero_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] + +v_mul_dx9_zero_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x09,0x13] + +v_mul_dx9_zero_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xf5,0x30] + +v_mul_u32_u24 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] + +v_mul_u32_u24 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] + +v_mul_u32_u24 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] + +v_mul_u32_u24 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] + +v_mul_u32_u24 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x09,0x13] + +v_mul_u32_u24 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// 
GFX12: encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x05,0x30] + +v_or_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] + +v_or_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] + +v_or_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] + +v_or_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] + +v_or_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] + +v_or_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] + +v_or_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] + +v_or_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] + +v_or_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x09,0x13] + +v_or_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x05,0x30] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not 
valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 
bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: 
[0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] + +v_sub_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] + +v_sub_f16 v5, v1, v2 row_mirror +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] + +v_sub_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] + +v_sub_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x09,0x13] + +v_sub_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xf5,0x30] + +v_sub_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] + +v_sub_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] + +v_sub_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] + 
+v_sub_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] + +v_sub_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] + +v_sub_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x09,0x13] + +v_sub_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xf5,0x30] + +v_sub_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] + +v_sub_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] + +v_sub_nc_u32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] + +v_sub_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] + +v_sub_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x60,0x09,0x13] + +v_sub_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x05,0x30] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_half_mirror +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shl:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_shr:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, 
v1, v2, vcc_lo row_shr:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_ror:15 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W32: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x09,0x13] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W32: encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x05,0x30] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for 
this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_half_mirror +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shl:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_shr:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:1 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_ror:15 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 +// W64: encoding: 
[0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// W64: encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x09,0x13] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// W64: encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x05,0x30] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] + +v_subrev_f16 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] + +v_subrev_f16 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] + +v_subrev_f16 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] + +v_subrev_f16 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x09,0x13] + +v_subrev_f16 v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xf5,0x30] + +v_subrev_f32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] + +v_subrev_f32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] + +v_subrev_f32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] + +v_subrev_f32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] + +v_subrev_f32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x09,0x13] + +v_subrev_f32 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xf5,0x30] + +v_subrev_nc_u32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] + +v_subrev_nc_u32 v5, v1, v2 quad_perm:[0,1,2,3] 
+// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] + +v_subrev_nc_u32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] + +v_subrev_nc_u32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x09,0x13] + +v_subrev_nc_u32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x05,0x30] + +v_xnor_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] + +v_xnor_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] + +v_xnor_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] + +v_xnor_b32 
v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] + +v_xnor_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] + +v_xnor_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x09,0x13] + +v_xnor_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x05,0x30] + +v_xor_b32 v5, v1, v2 quad_perm:[3,2,1,0] +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] + +v_xor_b32 v5, v1, v2 quad_perm:[0,1,2,3] +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] + +v_xor_b32 v5, v1, v2 row_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_half_mirror +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shl:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shl:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shr:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_shr:15 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_ror:1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_ror:15 +// GFX12: encoding: 
[0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] + +v_xor_b32 v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] + +v_xor_b32 v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x09,0x13] + +v_xor_b32 v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s index 63ffdbe821af8..a0f93f459f915 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp16.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | 
FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] // W32: encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8-fake16.s new file mode 100644 index 0000000000000..a7a035f4a9efc --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8-fake16.s @@ -0,0 +1,433 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s + +v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + 
+v_add_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +v_add_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] + +v_add_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] + +v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] + +v_add_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] + +v_add_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] + +v_add_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] + +v_add_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] + +v_add_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] + +v_and_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] + +v_and_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] + +v_and_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] + +v_ashrrev_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] + +v_ashrrev_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] + +v_ashrrev_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] + +v_cndmask_b32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cndmask_b32 v255, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pk_rtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pk_rtz_f16_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] + +v_cvt_pkrtz_f16_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pkrtz_f16_f32 v5, v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] + +v_cvt_pkrtz_f16_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] + +v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] + +v_fmac_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] + +v_fmac_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] + +v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] + +v_fmac_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] + +v_fmac_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] + +v_ldexp_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] + +v_ldexp_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] + +v_ldexp_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] + +v_lshlrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] + +v_lshlrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] + +v_lshlrev_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] + +v_lshrrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] + +v_lshrrev_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] + +v_lshrrev_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] + 
+v_max_num_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x62,0x01,0x77,0x39,0x05] + +v_max_num_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x62,0x01,0x77,0x39,0x05] + +v_max_num_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x62,0x7f,0x00,0x00,0x00] + +v_max_num_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x2c,0x01,0x77,0x39,0x05] + +v_max_num_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x2c,0x01,0x77,0x39,0x05] + +v_max_num_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x2d,0xff,0x00,0x00,0x00] + +v_max_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] + +v_max_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] + +v_max_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] + +v_max_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] + +v_max_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] + +v_max_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] + +v_min_num_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x60,0x01,0x77,0x39,0x05] + +v_min_num_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x60,0x01,0x77,0x39,0x05] + +v_min_num_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x60,0x7f,0x00,0x00,0x00] + +v_min_num_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x2a,0x01,0x77,0x39,0x05] + +v_min_num_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x2a,0x01,0x77,0x39,0x05] + 
+v_min_num_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x2b,0xff,0x00,0x00,0x00] + +v_min_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] + +v_min_i32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] + +v_min_i32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] + +v_min_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] + +v_min_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] + +v_min_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] + +v_mul_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] + +v_mul_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] + +v_mul_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] + +v_mul_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] + +v_mul_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] + +v_mul_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] + +v_mul_hi_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] + 
+v_mul_hi_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] + +v_mul_hi_i32_i24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] + +v_mul_hi_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] + +v_mul_hi_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] + +v_mul_hi_u32_u24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] + +v_mul_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] + +v_mul_i32_i24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] + +v_mul_i32_i24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] + +v_mul_dx9_zero_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] + +v_mul_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] + +v_mul_u32_u24 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] + +v_mul_u32_u24 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] + +v_or_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] + +v_or_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] + +v_or_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: 
encoding: [0xe9,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_sub_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] + +v_sub_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] + +v_sub_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] + +v_sub_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] + +v_sub_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] + +v_sub_f32 v255, v255, v255 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] + +v_sub_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] + +v_sub_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] + +v_sub_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] +// W32: encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W32: encoding: [0xea,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W32: encoding: [0xe9,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] +// W64: encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] fi:1 +// W64: encoding: [0xea,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_co_ci_u32 v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:0 +// W64: encoding: [0xe9,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +v_subrev_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] + +v_subrev_f16 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: 
[0xea,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] + +v_subrev_f16 v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] + +v_subrev_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] + +v_subrev_f32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] + +v_subrev_f32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] + +v_subrev_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] + +v_subrev_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] + +v_subrev_nc_u32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] + +v_xnor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] + +v_xnor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] + +v_xnor_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] + +v_xor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] + +v_xor_b32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0xea,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] + +v_xor_b32 v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: encoding: [0xe9,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s index 54baafb5366ff..81fcb323e2711 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_dpp8.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s -// RUN: not llvm-mc 
-triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W32 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12,W64 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -filetype=null %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] // W32: encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s index 045d698bd504b..b339bc1960f3e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s @@ -1,226 +1,227 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s -v_add_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmaak_f16_e32 v255, v1, v2, 0xfe0b -// GFX12: 
:[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmamk_f16_e32 v255, v1, 0xfe0b, v3 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_add_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_add_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_add_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_e32 v255, v1, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmaak_f16_e32 v255, v1, v2, 0xfe0b +// GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmaak_f16_e32 v5, v1, v255, 0xfe0b +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_fmaak_f16_e32 v5, v255, v2, 0xfe0b -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -v_fmamk_f16_e32 v5, v255, 0xfe0b, v3 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] 
+// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_e32 v5, v255, v2 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmaak_f16_e32 v5, v1, v255, 0xfe0b -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmac_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_fmamk_f16_e32 v255, v1, 0xfe0b, v3 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_fmamk_f16_e32 v5, v1, 0xfe0b, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_fmamk_f16_e32 v5, v255, 0xfe0b, v3 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_ldexp_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this 
GPU or mode +v_ldexp_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_e32 v5, v1, v255 -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_ldexp_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_max_num_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_min_num_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_max_num_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_mul_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_sub_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_dpp v5, v255, v2 
dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_subrev_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_add_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_fmac_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_ldexp_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_max_num_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_max_num_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_min_num_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_min_num_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_min_num_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_mul_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_num_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_sub_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_num_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// 
GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_subrev_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_num_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_num_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_min_num_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_max_num_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_min_num_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction -v_min_num_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_min_num_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:23: error: invalid operand for instruction -v_mul_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_mul_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_mul_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_mul_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode v_sub_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_ldexp_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_sub_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_sub_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_sub_f16_e32 v5, v255, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode 
+v_subrev_f16_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_max_num_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_subrev_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_min_num_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +v_subrev_f16_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_mul_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_dpp v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_e32 v255, v1, v2 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode -v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +v_subrev_f16_e32 v5, v1, v255 +// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode + +v_subrev_f16_e32 v5, v255, v2 +// GFX12: 
:[[@LINE-1]]:1: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s index 13939842f7303..e9e91fa70773d 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_promote.s @@ -1,190 +1,191 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --sort --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=_e32 %s v_add_f16 v255, v1, v2 -// GFX12: v_add_f16_e64 +// GFX12: v_add_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x32,0xd5,0x01,0x05,0x02,0x00] -v_fmac_f16 v255, v1, v2 -// GFX12: v_fmac_f16_e64 +v_add_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_add_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_ldexp_f16 v255, v1, v2 -// GFX12: v_ldexp_f16_e64 +v_add_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_add_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_max_num_f16 v255, v1, v2 -// GFX12: v_max_num_f16_e64 +v_add_f16 v5, v1, v255 +// GFX12: v_add_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x32,0xd5,0x01,0xff,0x03,0x00] -v_min_num_f16 v255, v1, v2 -// GFX12: v_min_num_f16_e64 +v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_add_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_mul_f16 v255, v1, v2 -// GFX12: v_mul_f16_e64 +v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: v_add_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_sub_f16 v255, v1, v2 -// GFX12: v_sub_f16_e64 +v_add_f16 v5, v255, v2 +// GFX12: v_add_f16_e64 
v5, v255, v2 ; encoding: [0x05,0x00,0x32,0xd5,0xff,0x05,0x02,0x00] -v_subrev_f16 v255, v1, v2 -// GFX12: v_subrev_f16_e64 +v_add_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_add_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_add_f16 v5, v255, v2 -// GFX12: v_add_f16_e64 +v_add_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_add_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_fmac_f16 v255, v1, v2 +// GFX12: v_fmac_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x36,0xd5,0x01,0x05,0x02,0x00] + +v_fmac_f16 v5, v1, v255 +// GFX12: v_fmac_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x36,0xd5,0x01,0xff,0x03,0x00] v_fmac_f16 v5, v255, v2 -// GFX12: v_fmac_f16_e64 +// GFX12: v_fmac_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x36,0xd5,0xff,0x05,0x02,0x00] -v_ldexp_f16 v5, v255, v2 -// GFX12: v_ldexp_f16_e64 +v_ldexp_f16 v255, v1, v2 +// GFX12: v_ldexp_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x3b,0xd5,0x01,0x05,0x02,0x00] -v_max_num_f16 v5, v255, v2 -// GFX12: v_max_num_f16_e64 +v_ldexp_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_ldexp_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_min_num_f16 v5, v255, v2 -// GFX12: v_min_num_f16_e64 +v_ldexp_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_ldexp_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v5, v255, v2 -// GFX12: v_mul_f16_e64 +v_ldexp_f16 v5, v255, v2 +// GFX12: v_ldexp_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x3b,0xd5,0xff,0x05,0x02,0x00] -v_sub_f16 v5, v255, v2 -// GFX12: v_sub_f16_e64 +v_ldexp_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_ldexp_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_subrev_f16 v5, v255, v2 -// GFX12: v_subrev_f16_e64 +v_ldexp_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_ldexp_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_add_f16 v5, v1, v255 -// GFX12: v_add_f16_e64 +v_max_num_f16 v255, v1, v2 +// GFX12: v_max_num_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x31,0xd5,0x01,0x05,0x02,0x00] -v_fmac_f16 v5, v1, v255 -// GFX12: v_fmac_f16_e64 +v_max_num_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_max_num_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] v_max_num_f16 v5, v1, v255 -// GFX12: v_max_num_f16_e64 +// GFX12: v_max_num_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x31,0xd5,0x01,0xff,0x03,0x00] -v_min_num_f16 v5, v1, v255 -// GFX12: v_min_num_f16_e64 +v_max_num_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_mul_f16 v5, v1, v255 -// GFX12: v_mul_f16_e64 +v_max_num_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_sub_f16 v5, v1, v255 -// GFX12: v_sub_f16_e64 +v_max_num_f16 v5, v255, v2 +// GFX12: v_max_num_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x31,0xd5,0xff,0x05,0x02,0x00] -v_subrev_f16 v5, v1, v255 -// GFX12: v_subrev_f16_e64 +v_max_num_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_add_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_add_f16_e64 +v_max_num_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_max_num_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_ldexp_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_ldexp_f16_e64 +v_min_num_f16 v255, v1, v2 +// GFX12: v_min_num_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x30,0xd5,0x01,0x05,0x02,0x00] -v_max_num_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_min_num_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min_num_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] v_min_num_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_min_num_f16_e64 +// GFX12: v_min_num_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_mul_f16_e64 - -v_sub_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_sub_f16_e64 +v_min_num_f16 v5, v1, v255 +// GFX12: v_min_num_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x30,0xd5,0x01,0xff,0x03,0x00] -v_subrev_f16 v255, v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_min_num_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min_num_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_add_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_add_f16_e64 +v_min_num_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: v_min_num_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_ldexp_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_ldexp_f16_e64 +v_min_num_f16 v5, v255, v2 +// GFX12: 
v_min_num_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x30,0xd5,0xff,0x05,0x02,0x00] -v_max_num_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_min_num_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_min_num_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] v_min_num_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_min_num_f16_e64 +// GFX12: v_min_num_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_mul_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_mul_f16_e64 - -v_sub_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_sub_f16_e64 +v_mul_f16 v255, v1, v2 +// GFX12: v_mul_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x35,0xd5,0x01,0x05,0x02,0x00] -v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_mul_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_mul_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_add_f16_e64 +v_mul_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_mul_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_max_num_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_mul_f16 v5, v1, v255 +// GFX12: v_mul_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x35,0xd5,0x01,0xff,0x03,0x00] -v_min_num_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_min_num_f16_e64 +v_mul_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_mul_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] v_mul_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_mul_f16_e64 - -v_sub_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_sub_f16_e64 +// GFX12: 
v_mul_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_subrev_f16 v5, v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_mul_f16 v5, v255, v2 +// GFX12: v_mul_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x35,0xd5,0xff,0x05,0x02,0x00] -v_add_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_add_f16_e64 +v_mul_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_mul_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_ldexp_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_ldexp_f16_e64 +v_mul_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_mul_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_max_num_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_sub_f16 v255, v1, v2 +// GFX12: v_sub_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x33,0xd5,0x01,0x05,0x02,0x00] -v_min_num_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min_num_f16_e64 +v_sub_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_mul_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_mul_f16_e64 +v_sub_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sub_f16_e64 +v_sub_f16 v5, v1, v255 +// GFX12: v_sub_f16_e64 v5, v1, v255 ; encoding: [0x05,0x00,0x33,0xd5,0x01,0xff,0x03,0x00] -v_subrev_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_sub_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v5, v1, v255 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_add_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_add_f16_e64 +v_sub_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_ldexp_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_ldexp_f16_e64 +v_sub_f16 v5, v255, v2 +// GFX12: v_sub_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x33,0xd5,0xff,0x05,0x02,0x00] -v_max_num_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_sub_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_min_num_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min_num_f16_e64 +v_sub_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_sub_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_mul_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_mul_f16_e64 +v_subrev_f16 v255, v1, v2 +// GFX12: v_subrev_f16_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x34,0xd5,0x01,0x05,0x02,0x00] -v_sub_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sub_f16_e64 +v_subrev_f16 v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v255, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xff,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_subrev_f16 v255, v1, v2 quad_perm:[3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v255, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xff,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_add_f16_e64 +v_subrev_f16 v5, v1, v255 +// GFX12: v_subrev_f16_e64 
v5, v1, v255 ; encoding: [0x05,0x00,0x34,0xd5,0x01,0xff,0x03,0x00] -v_max_num_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_max_num_f16_e64 +v_subrev_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_min_num_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_min_num_f16_e64 +v_subrev_f16 v5, v1, v255 quad_perm:[3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v5, v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_mul_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_mul_f16_e64 +v_subrev_f16 v5, v255, v2 +// GFX12: v_subrev_f16_e64 v5, v255, v2 ; encoding: [0x05,0x00,0x34,0xd5,0xff,0x05,0x02,0x00] -v_sub_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_sub_f16_e64 +v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_subrev_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_subrev_f16_e64 +v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0] +// GFX12: v_subrev_f16_e64_dpp v5, v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/ARM/mve-fp-registers.s b/llvm/test/MC/ARM/mve-fp-registers.s index 886de8c4797e7..4e3cd4440a6fd 100644 --- a/llvm/test/MC/ARM/mve-fp-registers.s +++ b/llvm/test/MC/ARM/mve-fp-registers.s @@ -1,5 +1,5 @@ // Some simple operations on S, D and Q registers (loads, stores and moves) are -// also avaliable in MVE, even in the integer-only version. Some of these +// also available in MVE, even in the integer-only version. Some of these // instructions (operating on D or Q registers, or FP16 values) are only // available for certain targets. 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt index 26ffd3a4e383b..4f638cd8ff54f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt @@ -1,2334 +1,2324 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-FAKE16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W32,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding %s | FileCheck -strict-whitespace -check-prefixes=GFX11,W64,GFX11-FAKE16 %s +0x01,0x05,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x40] -0x01,0x05,0x0a,0x40 +0xff,0x05,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; 
encoding: [0xff,0x05,0x0a,0x40] -0xff,0x05,0x0a,0x40 +0x01,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x40] -0x01,0x04,0x0a,0x40 +0x69,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x40] -0x69,0x04,0x0a,0x40 +0x6a,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x40] -0x6a,0x04,0x0a,0x40 +0x6b,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x40] -0x6b,0x04,0x0a,0x40 +0x7b,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x40] -0x7b,0x04,0x0a,0x40 +0x7d,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x40] -0x7d,0x04,0x0a,0x40 +0x7e,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x40] -0x7e,0x04,0x0a,0x40 +0x7f,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x40] -0x7f,0x04,0x0a,0x40 +0x7c,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x40] 
-0x7c,0x04,0x0a,0x40 +0xc1,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x40] -0xc1,0x04,0x0a,0x40 +0xf0,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x40] -0xf0,0x04,0x0a,0x40 +0xfd,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x40] -0xfd,0x04,0x0a,0x40 +0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf # W32: v_add_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] # W64: v_add_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf +0x01,0x05,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] -0x01,0x05,0x0a,0x64 +0x81,0x05,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x64] -0x81,0x05,0x0a,0x64 +0x7f,0x05,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x64] -0x7f,0x05,0x0a,0x64 +0xff,0x05,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x64] -0xff,0x05,0x0a,0x64 +0x01,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, s1, v2.l ; encoding: 
[0x01,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x64] -0x01,0x04,0x0a,0x64 +0x69,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x64] -0x69,0x04,0x0a,0x64 +0x6a,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x64] -0x6a,0x04,0x0a,0x64 +0x6b,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x64] -0x6b,0x04,0x0a,0x64 +0x7b,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x64] -0x7b,0x04,0x0a,0x64 +0x7d,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x64] -0x7d,0x04,0x0a,0x64 +0x7e,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x64] -0x7e,0x04,0x0a,0x64 +0x7f,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x64] -0x7f,0x04,0x0a,0x64 +0x7c,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x64] -0x7c,0x04,0x0a,0x64 +0xc1,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x64] -0xc1,0x04,0x0a,0x64 +0xf0,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, 0.5, 
v2.l ; encoding: [0xf0,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x64] -0xf0,0x04,0x0a,0x64 +0xfd,0x04,0x0a,0x64 # GFX11-REAL16: v_add_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x64] # GFX11-FAKE16: v_add_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x64] -0xfd,0x04,0x0a,0x64 -# GFX11-REAL16: v_add_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x65] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x65 0xfd,0x04,0x0b,0x65 +# GFX11-REAL16: v_add_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x65] +0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_add_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_add_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_add_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x65,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x65,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x65,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_add_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x65,0x0b,0xfe,0x00,0x00] -# GFX11: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06] 0x01,0x05,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x06] 0xff,0x05,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x06] 0x01,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x06] 0x69,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x06] 
0x6a,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x06] 0x6b,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x06] 0x7b,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x06] 0x7d,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x06] 0x7e,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x06] 0x7f,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x06] 0x7c,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x06] 0xc1,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x06] 0xf0,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x06] 0xfd,0x04,0x0a,0x06 +# GFX11: v_add_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x06] -# GFX11: v_add_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf +# GFX11: v_add_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] -# GFX11: v_add_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4a] 0x01,0x05,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 
v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4a] 0xff,0x05,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4a] 0x01,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4a] 0x69,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4a] 0x6a,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4a] 0x6b,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4a] 0x7b,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4a] 0x7d,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4a] 0x7e,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4a] 0x7f,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4a] 0x7c,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4a] 0xc1,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4a] 0xf0,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4a] 
-# GFX11: v_add_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4a] 0xfd,0x04,0x0a,0x4a +# GFX11: v_add_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4a] -# GFX11: v_add_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf +# GFX11: v_add_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] -# GFX11: v_and_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x36] 0x01,0x05,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x36] 0xff,0x05,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x36] 0x01,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x36] 0x69,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x36] 0x6a,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x36] 0x6b,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x36] 0x7b,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x36] 0x7d,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x36] 0x7e,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x36] 0x7f,0x04,0x0a,0x36 +# GFX11: 
v_and_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x36] 0x7c,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x36] 0xc1,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x36] 0xf0,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x36] 0xfd,0x04,0x0a,0x36 +# GFX11: v_and_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x36] -# GFX11: v_and_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf +# GFX11: v_and_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] -# GFX11: v_ashrrev_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x34] 0x01,0x05,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x34] 0xff,0x05,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x34] 0x01,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x34] 0x69,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x34] 0x6a,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x34] 0x6b,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x34] -# GFX11: 
v_ashrrev_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x34] 0x7b,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x34] 0x7d,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x34] 0x7e,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x34] 0x7f,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x34] 0x7c,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x34] 0xc1,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x34] 0xf0,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x34] 0xfd,0x04,0x0a,0x34 +# GFX11: v_ashrrev_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x34] -# GFX11: v_ashrrev_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf +# GFX11: v_ashrrev_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x02] -0x01,0x05,0x0a,0x02 +0xff,0x05,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x02] 
-0xff,0x05,0x0a,0x02 +0x01,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x02] -0x01,0x04,0x0a,0x02 +0x69,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x02] -0x69,0x04,0x0a,0x02 +0x6a,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x02] -0x6a,0x04,0x0a,0x02 +0x6b,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x02] -0x6b,0x04,0x0a,0x02 +0x7b,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x02] -0x7b,0x04,0x0a,0x02 +0x7d,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x02] -0x7d,0x04,0x0a,0x02 +0x7e,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x02] -0x7e,0x04,0x0a,0x02 +0x7f,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x02] -0x7f,0x04,0x0a,0x02 +0x7c,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x02] -0x7c,0x04,0x0a,0x02 +0xc1,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, -1, v2, vcc ; encoding: 
[0xc1,0x04,0x0a,0x02] -0xc1,0x04,0x0a,0x02 +0xf0,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x02] -0xf0,0x04,0x0a,0x02 +0xfd,0x04,0x0a,0x02 # W32: v_cndmask_b32_e32 v5, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x02] # W64: v_cndmask_b32_e32 v5, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x02] -0xfd,0x04,0x0a,0x02 +0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf # W32: v_cndmask_b32_e32 v255, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] # W64: v_cndmask_b32_e32 v255, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x5e] 0x01,0x05,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x5e] 0xff,0x05,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x5e] 0x01,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x5e] 0x69,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x5e] 0x6a,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x5e] 0x6b,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x5e] 0x7b,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, ttmp15, 
v2 ; encoding: [0x7b,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x5e] 0x7d,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x5e] 0x7e,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x5e] 0x7f,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x5e] 0x7c,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x5e] 0xc1,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x5e] 0xf0,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x5e] 0xfd,0x04,0x0a,0x5e +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x5e] -# GFX11: v_cvt_pk_rtz_f16_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf +# GFX11: v_cvt_pk_rtz_f16_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] -# GFX11: v_dot2acc_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] 0x01,0x05,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x04] 0xff,0x05,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, s1, v2 ; encoding: 
[0x01,0x04,0x0a,0x04] 0x01,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x04] 0x69,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x04] 0x6a,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x04] 0x6b,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x04] 0x7b,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x04] 0x7d,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x04] 0x7e,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x04] 0x7f,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x04] 0x7c,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x04] 0xc1,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x04] 0xf0,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x04] 0xfd,0x04,0x0a,0x04 +# GFX11: v_dot2acc_f32_f16 v5, src_scc, v2 ; encoding: 
[0xfd,0x04,0x0a,0x04] -# GFX11: v_dot2acc_f32_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00 +# GFX11: v_dot2acc_f32_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x05,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, v1, v2, 0xfe0b ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, v1, v2, 0xfe0b ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, v127, v2, 0xfe0b ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, v127, v2, 0xfe0b ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, s1, v2, 0xfe0b ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, s1, v2, 0xfe0b ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, s105, v2, 0xfe0b ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, s105, v2, 0xfe0b ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, m0, v2, 0xfe0b ; encoding: 
[0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, m0, v2, 0xfe0b ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, null, v2, 0xfe0b ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, null, v2, 0xfe0b ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, -1, v2, 0xfe0b ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, -1, v2, 0xfe0b ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, 0.5, v2, 0xfe0b ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, 0.5, v2, 0xfe0b ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v5, src_scc, v2, 0xfe0b ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v5, src_scc, v2, 0xfe0b ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmaak_f32 v5, v1, v2, 0xaf123456 ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 
v5, v1, v2, 0xaf123456 ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, v255, v2, 0xaf123456 ; encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, v255, v2, 0xaf123456 ; encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, s1, v2, 0xaf123456 ; encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, s1, v2, 0xaf123456 ; encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, s105, v2, 0xaf123456 ; encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, s105, v2, 0xaf123456 ; encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 ; encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 ; encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 ; encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 ; encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 ; encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 ; encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, m0, v2, 0xaf123456 ; encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, m0, v2, 0xaf123456 ; encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 ; encoding: [0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 ; encoding: 
[0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 ; encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 ; encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, null, v2, 0xaf123456 ; encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, null, v2, 0xaf123456 ; encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, -1, v2, 0xaf123456 ; encoding: [0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, -1, v2, 0xaf123456 ; encoding: [0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v5, src_scc, v2, 0xaf123456 ; encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v5, src_scc, v2, 0xaf123456 ; encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX11: v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 ; encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf +# GFX11: v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 ; encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0c] 0x01,0x05,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0c] 0xff,0x05,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0c] 0x01,0x04,0x0a,0x0c +# GFX11: 
v_fmac_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0c] 0x69,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0c] 0x6a,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0c] 0x6b,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0c] 0x7b,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0c] 0x7d,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0c] 0x7e,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0c] 0x7f,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0c] 0x7c,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0c] 0xc1,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0c] 0xf0,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0c] 
0xfd,0x04,0x0a,0x0c +# GFX11: v_fmac_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0c] -# GFX11: v_fmac_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf +# GFX11: v_fmac_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0d,0x56,0x34,0x12,0xaf] -# GFX11: v_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6c] 0x01,0x05,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6c] 0x7f,0x05,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6c] 0x01,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6c] 0x69,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6c] 0x6a,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6c] 0x6b,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6c] 0x7b,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6c] 0x7d,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6c] 0x7e,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x6c] 0x7f,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, exec_hi, v2 ; encoding: 
[0x7f,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6c] 0x7c,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6c] 0xc1,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6c] 0xf0,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6c] 0xfd,0x04,0x0a,0x6c +# GFX11: v_fmac_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6c] -# GFX11: v_fmac_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmac_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmac_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x56] 0x01,0x05,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x56] 0xff,0x05,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x56] 0x01,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x56] 0x69,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x56] 0x6a,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x56] 0x6b,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x56] 0x7b,0x04,0x0a,0x56 +# 
GFX11: v_fmac_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x56] 0x7d,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x56] 0x7e,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x56] 0x7f,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x56] 0x7c,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x56] 0xc1,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x56] 0xf0,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x56] 0xfd,0x04,0x0a,0x56 +# GFX11: v_fmac_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x56] -# GFX11: v_fmac_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf +# GFX11: v_fmac_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f16 v5, v1, 0xfe0b, v3 ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, v1, 0xfe0b, v3 ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, v127, 0xfe0b, v3 ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, v127, 0xfe0b, v3 ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, s1, 0xfe0b, v3 ; encoding: 
[0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, s1, 0xfe0b, v3 ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, s105, 0xfe0b, v3 ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, s105, 0xfe0b, v3 ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, m0, 0xfe0b, v3 ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, m0, 0xfe0b, v3 ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, null, 0xfe0b, v3 ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, 
null, 0xfe0b, v3 ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, -1, 0xfe0b, v3 ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, -1, 0xfe0b, v3 ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, 0.5, 0xfe0b, v3 ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, 0.5, 0xfe0b, v3 ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v5, src_scc, 0xfe0b, v3 ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v5, src_scc, 0xfe0b, v3 ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00 +# GFX11: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] -# GFX11: v_fmamk_f32 v5, v1, 0xaf123456, v3 ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, v1, 0xaf123456, v3 ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, v255, 0xaf123456, v3 ; encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, v255, 0xaf123456, v3 ; encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, s1, 0xaf123456, v3 ; encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, s1, 0xaf123456, v3 ; encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, s105, 0xaf123456, v3 ; encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, s105, 0xaf123456, v3 ; encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: 
v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 ; encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 ; encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 ; encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 ; encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 ; encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 ; encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, m0, 0xaf123456, v3 ; encoding: [0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, m0, 0xaf123456, v3 ; encoding: [0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 ; encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 ; encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 ; encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 ; encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, null, 0xaf123456, v3 ; encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, null, 0xaf123456, v3 ; encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, -1, 0xaf123456, v3 ; encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, -1, 0xaf123456, v3 ; encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, 0.5, 0xaf123456, v3 ; encoding: 
[0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, 0.5, 0xaf123456, v3 ; encoding: [0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v5, src_scc, 0xaf123456, v3 ; encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v5, src_scc, 0xaf123456, v3 ; encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX11: v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf +# GFX11: v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x76] 0x01,0x05,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x76] 0x7f,0x05,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x76] 0x01,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x76] 0x69,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x76] +# 
GFX11-FAKE16: v_ldexp_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x76] 0x6a,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x76] 0x6b,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x76] 0x7b,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x76] 0x7d,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x76] 0x7e,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, exec_hi, v2.l ; encoding: 
[0x7f,0x04,0x0a,0x76] 0x7f,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x76] 0x7c,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x76] 0xc1,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x76] 0xf0,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x76] -# GFX11-REAL16: v_ldexp_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x76] 0xfd,0x04,0x0a,0x76 +# GFX11-REAL16: v_ldexp_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x76] +# GFX11-FAKE16: v_ldexp_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x76] -# GFX11-FAKE16: v_ldexp_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] -# GFX11-REAL16: v_ldexp_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_ldexp_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: 
v_ldexp_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] -# GFX11: v_lshlrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x30] 0x01,0x05,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x30] 0xff,0x05,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x30] 0x01,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x30] 0x69,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x30] 0x6a,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x30] 0x6b,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x30] 0x7b,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x30] 0x7d,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x30] 0x7e,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x30] 0x7f,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x30] 0x7c,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 
v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x30] 0xc1,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x30] 0xf0,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x30] 0xfd,0x04,0x0a,0x30 +# GFX11: v_lshlrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x30] -# GFX11: v_lshlrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf +# GFX11: v_lshlrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] -# GFX11: v_lshrrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x32] 0x01,0x05,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x32] 0xff,0x05,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x32] 0x01,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x32] 0x69,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x32] 0x6a,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x32] 0x6b,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x32] 0x7b,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x32] 
0x7d,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x32] 0x7e,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x32] 0x7f,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x32] 0x7c,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x32] 0xc1,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x32] 0xf0,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x32] 0xfd,0x04,0x0a,0x32 +# GFX11: v_lshrrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x32] -# GFX11: v_lshrrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf +# GFX11: v_lshrrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x72] -0x01,0x05,0x0a,0x72 +0x81,0x05,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x72] -0x81,0x05,0x0a,0x72 +0x7f,0x05,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, v127, v2 ; encoding: 
[0x7f,0x05,0x0a,0x72] -0x7f,0x05,0x0a,0x72 +0xff,0x05,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x72] -0xff,0x05,0x0a,0x72 +0x01,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x72] -0x01,0x04,0x0a,0x72 +0x69,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x72] -0x69,0x04,0x0a,0x72 +0x6a,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x72] -0x6a,0x04,0x0a,0x72 +0x6b,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x72] -0x6b,0x04,0x0a,0x72 +0x7b,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x72] -0x7b,0x04,0x0a,0x72 +0x7d,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x72] -0x7d,0x04,0x0a,0x72 +0x7e,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x72] -0x7e,0x04,0x0a,0x72 +0x7f,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x72] -0x7f,0x04,0x0a,0x72 +0x7c,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, null, v2.l ; 
encoding: [0x7c,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x72] -0x7c,0x04,0x0a,0x72 +0xc1,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x72] -0xc1,0x04,0x0a,0x72 +0xf0,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x72] -0xf0,0x04,0x0a,0x72 +0xfd,0x04,0x0a,0x72 # GFX11-REAL16: v_max_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x72] # GFX11-FAKE16: v_max_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x72] -0xfd,0x04,0x0a,0x72 -# GFX11-REAL16: v_max_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x73] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x73 0xfd,0x04,0x0b,0x73 +# GFX11-REAL16: v_max_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x73] +0xff,0xfe,0xfe,0x72,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_max_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x72,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_max_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x72,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x72,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_max_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x73,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x73,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x73,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_max_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x73,0x0b,0xfe,0x00,0x00] -# GFX11: v_max_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x20] 0x01,0x05,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x20] 0xff,0x05,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x20] -# GFX11: 
v_max_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x20] 0x01,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x20] 0x69,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x20] 0x6a,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x20] 0x6b,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x20] 0x7b,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x20] 0x7d,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x20] 0x7e,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x20] 0x7f,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x20] 0x7c,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x20] 0xc1,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x20] 0xf0,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x20] 0xfd,0x04,0x0a,0x20 +# GFX11: v_max_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x20] -# GFX11: v_max_f32_e32 v255, 0xaf123456, v255 ; 
encoding: [0xff,0xfe,0xff,0x21,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x21,0x56,0x34,0x12,0xaf +# GFX11: v_max_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x21,0x56,0x34,0x12,0xaf] -# GFX11: v_max_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x24] 0x01,0x05,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x24] 0xff,0x05,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x24] 0x01,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x24] 0x69,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x24] 0x6a,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x24] 0x6b,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x24] 0x7b,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x24] 0x7d,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x24] 0x7e,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x24] 0x7f,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x24] 0x7c,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x24] -# GFX11: 
v_max_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x24] 0xc1,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x24] 0xf0,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x24] 0xfd,0x04,0x0a,0x24 +# GFX11: v_max_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x24] -# GFX11: v_max_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf +# GFX11: v_max_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] -# GFX11: v_max_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x28] 0x01,0x05,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x28] 0xff,0x05,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x28] 0x01,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x28] 0x69,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x28] 0x6a,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x28] 0x6b,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x28] 0x7b,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x28] 0x7d,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, m0, v2 ; encoding: 
[0x7d,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x28] 0x7e,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x28] 0x7f,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x28] 0x7c,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x28] 0xc1,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x28] 0xf0,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x28] 0xfd,0x04,0x0a,0x28 +# GFX11: v_max_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x28] -# GFX11: v_max_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf +# GFX11: v_max_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x74] -0x01,0x05,0x0a,0x74 +0x81,0x05,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x74] -0x81,0x05,0x0a,0x74 +0x7f,0x05,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x74] -0x7f,0x05,0x0a,0x74 +0xff,0x05,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, v127.h, v2.l ; encoding: 
[0xff,0x05,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x74] -0xff,0x05,0x0a,0x74 +0x01,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x74] -0x01,0x04,0x0a,0x74 +0x69,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x74] -0x69,0x04,0x0a,0x74 +0x6a,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x74] -0x6a,0x04,0x0a,0x74 +0x6b,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x74] -0x6b,0x04,0x0a,0x74 +0x7b,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x74] -0x7b,0x04,0x0a,0x74 +0x7d,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x74] -0x7d,0x04,0x0a,0x74 +0x7e,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x74] -0x7e,0x04,0x0a,0x74 +0x7f,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x74] -0x7f,0x04,0x0a,0x74 +0x7c,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x74] -0x7c,0x04,0x0a,0x74 
+0xc1,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x74] -0xc1,0x04,0x0a,0x74 +0xf0,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x74] -0xf0,0x04,0x0a,0x74 +0xfd,0x04,0x0a,0x74 # GFX11-REAL16: v_min_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x74] # GFX11-FAKE16: v_min_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x74] -0xfd,0x04,0x0a,0x74 -# GFX11-REAL16: v_min_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x75] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x75 0xfd,0x04,0x0b,0x75 +# GFX11-REAL16: v_min_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x75] +0xff,0xfe,0xfe,0x74,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_min_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x74,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_min_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x74,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x74,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_min_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x75,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x75,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x75,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_min_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x75,0x0b,0xfe,0x00,0x00] -# GFX11: v_min_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x1e] 0x01,0x05,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x1e] 0xff,0x05,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x1e] 0x01,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, s1, v2 ; encoding: 
[0x01,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x1e] 0x69,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x1e] 0x6a,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x1e] 0x6b,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x1e] 0x7b,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x1e] 0x7d,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x1e] 0x7e,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x1e] 0x7f,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x1e] 0x7c,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x1e] 0xc1,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x1e] 0xf0,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x1e] 0xfd,0x04,0x0a,0x1e +# GFX11: v_min_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x1e] -# GFX11: v_min_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x1f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x1f,0x56,0x34,0x12,0xaf +# GFX11: v_min_f32_e32 v255, 
0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x1f,0x56,0x34,0x12,0xaf] -# GFX11: v_min_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x22] 0x01,0x05,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x22] 0xff,0x05,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x22] 0x01,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x22] 0x69,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x22] 0x6a,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x22] 0x6b,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x22] 0x7b,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x22] 0x7d,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x22] 0x7e,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x22] 0x7f,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x22] 0x7c,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x22] 0xc1,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, -1, v2 ; 
encoding: [0xc1,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x22] 0xf0,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x22] 0xfd,0x04,0x0a,0x22 +# GFX11: v_min_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x22] -# GFX11: v_min_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf +# GFX11: v_min_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] -# GFX11: v_min_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x26] 0x01,0x05,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x26] 0xff,0x05,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x26] 0x01,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x26] 0x69,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x26] 0x6a,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x26] 0x6b,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x26] 0x7b,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x26] 0x7d,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x26] 0x7e,0x04,0x0a,0x26 +# 
GFX11: v_min_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x26] 0x7f,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x26] 0x7c,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x26] 0xc1,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x26] 0xf0,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x26] 0xfd,0x04,0x0a,0x26 +# GFX11: v_min_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x26] -# GFX11: v_min_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf +# GFX11: v_min_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] -# GFX11: v_mul_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0e] 0x01,0x05,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0e] 0xff,0x05,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0e] 0x01,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0e] 0x69,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0e] 0x6a,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0e] 0x6b,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0e] 0x7b,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0e] 0x7d,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0e] 0x7e,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0e] 0x7f,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0e] 0x7c,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0e] 0xc1,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0e] 0xf0,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0e] 0xfd,0x04,0x0a,0x0e +# GFX11: v_mul_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0e] -# GFX11: v_mul_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf +# GFX11: v_mul_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, v1.l, v2.l ; 
encoding: [0x01,0x05,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6a] -0x01,0x05,0x0a,0x6a -# GFX11-REAL16: v_mul_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x6a] -# GFX11-FAKE16: v_mul_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x6a 0x81,0x05,0x0a,0x6a +# GFX11-REAL16: v_mul_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x6a] +# GFX11-FAKE16: v_mul_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x6a] +0x7f,0x05,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6a] -0x7f,0x05,0x0a,0x6a +0xff,0x05,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x6a] -0xff,0x05,0x0a,0x6a +0x01,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6a] -0x01,0x04,0x0a,0x6a +0x69,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6a] -0x69,0x04,0x0a,0x6a +0x6a,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6a] -0x6a,0x04,0x0a,0x6a +0x6b,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6a] -0x6b,0x04,0x0a,0x6a +0x7b,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, 
ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6a] -0x7b,0x04,0x0a,0x6a +0x7d,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6a] -0x7d,0x04,0x0a,0x6a +0x7e,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6a] -0x7e,0x04,0x0a,0x6a +0x7f,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x6a] -0x7f,0x04,0x0a,0x6a +0x7c,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6a] -0x7c,0x04,0x0a,0x6a +0xc1,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6a] -0xc1,0x04,0x0a,0x6a +0xf0,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6a] -0xf0,0x04,0x0a,0x6a +0xfd,0x04,0x0a,0x6a # GFX11-REAL16: v_mul_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x6a] # GFX11-FAKE16: v_mul_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6a] -0xfd,0x04,0x0a,0x6a -# GFX11-REAL16: v_mul_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x6b] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x6b 0xfd,0x04,0x0b,0x6b +# GFX11-REAL16: v_mul_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x6b] +0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_mul_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_mul_f16_e32 v127, 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_mul_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x6b,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x6b,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x6b,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_mul_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x6b,0x0b,0xfe,0x00,0x00] -# GFX11: v_mul_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x10] 0x01,0x05,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x10] 0xff,0x05,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x10] 0x01,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x10] 0x69,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x10] 0x6a,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x10] 0x6b,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x10] 0x7b,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x10] 0x7d,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x10] 0x7e,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x10] 
0x7f,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x10] 0x7c,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x10] 0xc1,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x10] 0xf0,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x10] 0xfd,0x04,0x0a,0x10 +# GFX11: v_mul_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x10] -# GFX11: v_mul_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf +# GFX11: v_mul_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] -# GFX11: v_mul_hi_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x14] 0x01,0x05,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x14] 0xff,0x05,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x14] 0x01,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x14] 0x69,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x14] 0x6a,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x14] 0x6b,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, 
vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x14] 0x7b,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x14] 0x7d,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x14] 0x7e,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x14] 0x7f,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x14] 0x7c,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x14] 0xc1,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x14] 0xf0,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x14] 0xfd,0x04,0x0a,0x14 +# GFX11: v_mul_hi_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x14] -# GFX11: v_mul_hi_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf +# GFX11: v_mul_hi_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] -# GFX11: v_mul_hi_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x18] 0x01,0x05,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x18] 0xff,0x05,0x0a,0x18 +# 
GFX11: v_mul_hi_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x18] 0x01,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x18] 0x69,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x18] 0x6a,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x18] 0x6b,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x18] 0x7b,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x18] 0x7d,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x18] 0x7e,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x18] 0x7f,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x18] 0x7c,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x18] 0xc1,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x18] 0xf0,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, 0.5, v2 ; encoding: 
[0xf0,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x18] 0xfd,0x04,0x0a,0x18 +# GFX11: v_mul_hi_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x18] -# GFX11: v_mul_hi_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf +# GFX11: v_mul_hi_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] -# GFX11: v_mul_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x12] 0x01,0x05,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x12] 0xff,0x05,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x12] 0x01,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x12] 0x69,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x12] 0x6a,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x12] 0x6b,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x12] 0x7b,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x12] 0x7d,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x12] 0x7e,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x12] -# 
GFX11: v_mul_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x12] 0x7f,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x12] 0x7c,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x12] 0xc1,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x12] 0xf0,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x12] 0xfd,0x04,0x0a,0x12 +# GFX11: v_mul_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x12] -# GFX11: v_mul_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf +# GFX11: v_mul_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] -# GFX11: v_mul_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x16] 0x01,0x05,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x16] 0xff,0x05,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x16] 0x01,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x16] 0x69,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x16] 0x6a,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, vcc_hi, v2 ; 
encoding: [0x6b,0x04,0x0a,0x16] 0x6b,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x16] 0x7b,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x16] 0x7d,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x16] 0x7e,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x16] 0x7f,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x16] 0x7c,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x16] 0xc1,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x16] 0xf0,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x16] 0xfd,0x04,0x0a,0x16 +# GFX11: v_mul_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x16] -# GFX11: v_mul_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf +# GFX11: v_mul_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] -# GFX11: v_or_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x38] 0x01,0x05,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x38] 0xff,0x05,0x0a,0x38 +# 
GFX11: v_or_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x38] 0x01,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x38] 0x69,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] 0x6a,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x38] 0x6b,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x38] 0x7b,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x38] 0x7d,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x38] 0x7e,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x38] 0x7f,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x38] 0x7c,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x38] 0xc1,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x38] 0xf0,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x38] 0xfd,0x04,0x0a,0x38 +# GFX11: v_or_b32_e32 v5, src_scc, v2 ; encoding: 
[0xfd,0x04,0x0a,0x38] -# GFX11: v_or_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf +# GFX11: v_or_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] -# GFX11: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] 0x01,0x05,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] 0xff,0x05,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] 0x01,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x78] 0x69,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] 0x6a,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] 0x6b,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x78] 0x7b,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x78] 0x7d,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] 0x7e,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] 0x7f,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x78] 0x7c,0x04,0x0a,0x78 +# GFX11: 
v_pk_fmac_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] 0xc1,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] 0xf0,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x78] 0xfd,0x04,0x0a,0x78 +# GFX11: v_pk_fmac_f16 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x78] -# GFX11: v_pk_fmac_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00 +# GFX11: v_pk_fmac_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] +0x01,0x05,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x42] -0x01,0x05,0x0a,0x42 +0xff,0x05,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x42] -0xff,0x05,0x0a,0x42 +0x01,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x42] -0x01,0x04,0x0a,0x42 +0x69,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x42] -0x69,0x04,0x0a,0x42 +0x6a,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x42] -0x6a,0x04,0x0a,0x42 +0x6b,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 
v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x42] -0x6b,0x04,0x0a,0x42 +0x7b,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x42] -0x7b,0x04,0x0a,0x42 +0x7d,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x42] -0x7d,0x04,0x0a,0x42 +0x7e,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x42] -0x7e,0x04,0x0a,0x42 +0x7f,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x42] -0x7f,0x04,0x0a,0x42 +0x7c,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x42] -0x7c,0x04,0x0a,0x42 +0xc1,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x42] -0xc1,0x04,0x0a,0x42 +0xf0,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x42] -0xf0,0x04,0x0a,0x42 +0xfd,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x42] -0xfd,0x04,0x0a,0x42 +0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] # W64: 
v_sub_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf +0x01,0x05,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x66] -0x01,0x05,0x0a,0x66 +0x81,0x05,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x66] -0x81,0x05,0x0a,0x66 +0x7f,0x05,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x66] -0x7f,0x05,0x0a,0x66 +0xff,0x05,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x66] -0xff,0x05,0x0a,0x66 +0x01,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x66] -0x01,0x04,0x0a,0x66 +0x69,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x66] -0x69,0x04,0x0a,0x66 +0x6a,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x66] -0x6a,0x04,0x0a,0x66 +0x6b,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x66] -0x6b,0x04,0x0a,0x66 +0x7b,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x66] # GFX11-FAKE16: 
v_sub_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x66] -0x7b,0x04,0x0a,0x66 +0x7d,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x66] -0x7d,0x04,0x0a,0x66 +0x7e,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x66] -0x7e,0x04,0x0a,0x66 +0x7f,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x66] -0x7f,0x04,0x0a,0x66 +0x7c,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x66] -0x7c,0x04,0x0a,0x66 +0xc1,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x66] -0xc1,0x04,0x0a,0x66 +0xf0,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x66] -0xf0,0x04,0x0a,0x66 +0xfd,0x04,0x0a,0x66 # GFX11-REAL16: v_sub_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x66] # GFX11-FAKE16: v_sub_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x66] -0xfd,0x04,0x0a,0x66 -# GFX11-REAL16: v_sub_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x67] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x67 0xfd,0x04,0x0b,0x67 +# GFX11-REAL16: v_sub_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x67] +0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_sub_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_sub_f16_e32 v127, 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_sub_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x67,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x67,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x67,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_sub_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x67,0x0b,0xfe,0x00,0x00] -# GFX11: v_sub_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x08] 0x01,0x05,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x08] 0xff,0x05,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x08] 0x01,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x08] 0x69,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x08] 0x6a,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x08] 0x6b,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x08] 0x7b,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x08] 0x7d,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x08] 0x7e,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x08] 
0x7f,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x08] 0x7c,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x08] 0xc1,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x08] 0xf0,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x08] 0xfd,0x04,0x0a,0x08 +# GFX11: v_sub_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x08] -# GFX11: v_sub_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf +# GFX11: v_sub_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] -# GFX11: v_sub_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4c] 0x01,0x05,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4c] 0xff,0x05,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4c] 0x01,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4c] 0x69,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4c] 0x6a,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4c] 0x6b,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4c] -# 
GFX11: v_sub_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4c] 0x7b,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4c] 0x7d,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4c] 0x7e,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4c] 0x7f,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4c] 0x7c,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4c] 0xc1,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4c] 0xf0,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4c] 0xfd,0x04,0x0a,0x4c +# GFX11: v_sub_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4c] -# GFX11: v_sub_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf +# GFX11: v_sub_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x44] -0x01,0x05,0x0a,0x44 +0xff,0x05,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; encoding: 
[0xff,0x05,0x0a,0x44] -0xff,0x05,0x0a,0x44 +0x01,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x44] -0x01,0x04,0x0a,0x44 +0x69,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x44] -0x69,0x04,0x0a,0x44 +0x6a,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x44] -0x6a,0x04,0x0a,0x44 +0x6b,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x44] -0x6b,0x04,0x0a,0x44 +0x7b,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x44] -0x7b,0x04,0x0a,0x44 +0x7d,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x44] -0x7d,0x04,0x0a,0x44 +0x7e,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x44] -0x7e,0x04,0x0a,0x44 +0x7f,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x44] -0x7f,0x04,0x0a,0x44 +0x7c,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, null, v2, 
vcc ; encoding: [0x7c,0x04,0x0a,0x44] -0x7c,0x04,0x0a,0x44 +0xc1,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x44] -0xc1,0x04,0x0a,0x44 +0xf0,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x44] -0xf0,0x04,0x0a,0x44 +0xfd,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x44] -0xfd,0x04,0x0a,0x44 +0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf # W32: v_subrev_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] # W64: v_subrev_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf +0x01,0x05,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x68] -0x01,0x05,0x0a,0x68 +0x81,0x05,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x68] -0x81,0x05,0x0a,0x68 +0x7f,0x05,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x68] -0x7f,0x05,0x0a,0x68 +0xff,0x05,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x68] -0xff,0x05,0x0a,0x68 
+0x01,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x68] -0x01,0x04,0x0a,0x68 +0x69,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x68] -0x69,0x04,0x0a,0x68 +0x6a,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x68] -0x6a,0x04,0x0a,0x68 +0x6b,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x68] -0x6b,0x04,0x0a,0x68 +0x7b,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x68] -0x7b,0x04,0x0a,0x68 +0x7d,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x68] -0x7d,0x04,0x0a,0x68 +0x7e,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x68] -0x7e,0x04,0x0a,0x68 +0x7f,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x68] -0x7f,0x04,0x0a,0x68 +0x7c,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x68] -0x7c,0x04,0x0a,0x68 +0xc1,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x68] # GFX11-FAKE16: 
v_subrev_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x68] -0xc1,0x04,0x0a,0x68 +0xf0,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x68] -0xf0,0x04,0x0a,0x68 +0xfd,0x04,0x0a,0x68 # GFX11-REAL16: v_subrev_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x68] # GFX11-FAKE16: v_subrev_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x68] -0xfd,0x04,0x0a,0x68 -# GFX11-REAL16: v_subrev_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x69] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xfd,0x04,0x0b,0x69 0xfd,0x04,0x0b,0x69 +# GFX11-REAL16: v_subrev_f16_e32 v5.h, src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x69] +0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00 # GFX11-REAL16: v_subrev_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] # GFX11-FAKE16: v_subrev_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] -0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00 -# GFX11-REAL16: v_subrev_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x69,0x0b,0xfe,0x00,0x00] -# COM: TODO: GFX11-FAKE16: warning: invalid instruction encoding 0xff,0xfe,0xff,0x69,0x0b,0xfe,0x00,0x00 0xff,0xfe,0xff,0x69,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_subrev_f16_e32 v127.h, 0xfe0b, v127.h ; encoding: [0xff,0xfe,0xff,0x69,0x0b,0xfe,0x00,0x00] -# GFX11: v_subrev_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0a] 0x01,0x05,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0a] 0xff,0x05,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0a] 0x01,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, s105, v2 ; encoding: 
[0x69,0x04,0x0a,0x0a] 0x69,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0a] 0x6a,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0a] 0x6b,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0a] 0x7b,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0a] 0x7d,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0a] 0x7e,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0a] 0x7f,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0a] 0x7c,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0a] 0xc1,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0a] 0xf0,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0a] 0xfd,0x04,0x0a,0x0a +# GFX11: v_subrev_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0a] -# GFX11: v_subrev_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf +# GFX11: v_subrev_f32_e32 v255, 
0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] -# GFX11: v_subrev_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4e] 0x01,0x05,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4e] 0xff,0x05,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4e] 0x01,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4e] 0x69,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4e] 0x6a,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4e] 0x6b,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4e] 0x7b,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4e] 0x7d,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4e] 0x7e,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4e] 0x7f,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4e] 0x7c,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4e] -# 
GFX11: v_subrev_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4e] 0xc1,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4e] 0xf0,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4e] 0xfd,0x04,0x0a,0x4e +# GFX11: v_subrev_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4e] -# GFX11: v_subrev_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf +# GFX11: v_subrev_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] -# GFX11: v_xnor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3c] 0x01,0x05,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3c] 0xff,0x05,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3c] 0x01,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3c] 0x69,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3c] 0x6a,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3c] 0x6b,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3c] 0x7b,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3c] 
0x7d,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3c] 0x7e,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3c] 0x7f,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3c] 0x7c,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3c] 0xc1,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3c] 0xf0,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3c] 0xfd,0x04,0x0a,0x3c +# GFX11: v_xnor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3c] -# GFX11: v_xnor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf +# GFX11: v_xnor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] -# GFX11: v_xor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3a] 0x01,0x05,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3a] 0xff,0x05,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3a] 0x01,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3a] 0x69,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, 
vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3a] 0x6a,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3a] 0x6b,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3a] 0x7b,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3a] 0x7d,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3a] 0x7e,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3a] 0x7f,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3a] 0x7c,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3a] 0xc1,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3a] 0xf0,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3a] 0xfd,0x04,0x0a,0x3a +# GFX11: v_xor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3a] -# GFX11: v_xor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf +# GFX11: v_xor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt index 
eebf0cc13cee6..a8a40f883cc48 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp16.txt @@ -1,1750 +1,1851 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64,GFX11-FAKE16 %s +0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff 
+0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30 # W32: v_add_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30] # W64: v_add_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30 -# GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] +# 
GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] -# GFX11: v_add_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] -# GFX11: v_add_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] -# GFX11: v_add_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_add_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_add_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] -# GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff +# GFX11: 
v_add_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff +# GFX11: v_add_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] -# GFX11: v_add_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01 +# GFX11: v_add_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] -# GFX11: v_add_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13 +# GFX11: v_add_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13] -# GFX11: v_add_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30 +# GFX11: v_add_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, 
v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01 +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13 +# GFX11: v_add_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13] -# GFX11: v_add_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30 +# GFX11: v_add_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30] -# GFX11: v_and_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] 
0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff +# GFX11: v_and_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] -# GFX11: v_and_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01 +# GFX11: v_and_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] -# GFX11: v_and_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13 +# GFX11: v_and_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13] -# GFX11: v_and_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30 +# GFX11: v_and_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01 +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13 +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13] -# GFX11: v_ashrrev_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30 +# GFX11: v_ashrrev_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff 
+0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30 # W32: v_cndmask_b32_dpp v255, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30] # W64: v_cndmask_b32_dpp v255, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30 +0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0xff # W32: v_cndmask_b32_dpp v5, -v1, |v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0xff] # W64: v_cndmask_b32_dpp v5, -v1, |v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0xff] -0xfa,0x04,0x0a,0x02,0x01,0xe4,0x90,0xff +0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0xff # W32: v_cndmask_b32_dpp v5, |v1|, -v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0xff] # W64: v_cndmask_b32_dpp v5, |v1|, -v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0xff] -0xfa,0x04,0x0a,0x02,0x01,0xe4,0x60,0xff +0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff # W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff] # W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff] -0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01 +# GFX11: 
v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13 +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30 +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0xff +# GFX11: 
v_dot2acc_f32_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0xff] -# GFX11: 
v_dot2acc_f32_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x04,0x01,0x50,0x01,0xff +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x50,0x01,0xff] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x04,0x01,0x5f,0x01,0x01 +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x5f,0x01,0x01] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x04,0x01,0x60,0x01,0x13 +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0x60,0x01,0x13] -# GFX11: v_dot2acc_f32_f16_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x05,0xff,0x6f,0xfd,0x30 +# GFX11: v_dot2acc_f32_f16_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x05,0xff,0x6f,0xfd,0x30] -# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01 +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] -# GFX11: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13 +# GFX11: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13] -# GFX11: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30 +# GFX11: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30] -# GFX11: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01 +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] -# GFX11: v_fmac_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13 +# GFX11: v_fmac_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13] -# GFX11: v_fmac_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30 +# GFX11: v_fmac_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] -# 
GFX11: v_ldexp_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] -# GFX11: v_ldexp_f16_dpp v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] 0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30 +# 
GFX11-REAL16: v_ldexp_f16_dpp v127.l, -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] +# GFX11-FAKE16: v_ldexp_f16_dpp v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01 +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13 +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13] -# GFX11: v_lshlrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30 +# GFX11: v_lshlrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01 +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13 +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13] -# GFX11: v_lshrrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30 +# GFX11: v_lshrrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30] -# GFX11: v_max_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1b,0x00,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x72,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0xe4,0x00,0xff] -# GFX11: 
v_max_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x40,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x41,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x01,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x0f,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_shr:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x11,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x1f,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x21,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x2f,0x01,0xff] -# GFX11: v_max_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x72,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x50,0x01,0xff] -# GFX11: 
v_max_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x72,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x5f,0x01,0x01] -# GFX11: v_max_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x72,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x72,0x01,0x60,0x01,0x13] -# GFX11: v_max_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x72,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x72,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_max_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x72,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_max_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x72,0x7f,0x6f,0xfd,0x30] -# GFX11: v_max_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x1b,0x00,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1b,0x00,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x40,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x40,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x41,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x41,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x01,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x01,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x0f,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x0f,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x11,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x11,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x1f,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x1f,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x21,0x01,0xff +# GFX11: 
v_max_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x21,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x2f,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x2f,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x20,0x01,0x50,0x01,0xff +# GFX11: v_max_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x50,0x01,0xff] -# GFX11: v_max_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x20,0x01,0x5f,0x01,0x01 +# GFX11: v_max_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x5f,0x01,0x01] -# GFX11: v_max_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x20,0x01,0x60,0x01,0x13 +# GFX11: v_max_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x20,0x01,0x60,0x01,0x13] -# GFX11: v_max_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x21,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x21,0xff,0x6f,0xfd,0x30 +# GFX11: v_max_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x21,0xff,0x6f,0xfd,0x30] -# GFX11: v_max_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] -# GFX11: v_max_i32_dpp v5, 
v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff +# GFX11: v_max_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] -# GFX11: v_max_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01 +# GFX11: v_max_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] -# GFX11: v_max_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13 +# GFX11: v_max_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13] -# GFX11: v_max_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30 +# GFX11: v_max_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30] -# GFX11: v_max_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff +# GFX11: v_max_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] -# GFX11: v_max_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01 +# GFX11: v_max_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] -# GFX11: v_max_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13 +# GFX11: v_max_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13] -# GFX11: v_max_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30] 
0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30 +# GFX11: v_max_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30] -# GFX11: v_min_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1b,0x00,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x74,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0xe4,0x00,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x40,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x74,0x01,0x41,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x01,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x0f,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x11,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x1f,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp 
v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x21,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x2f,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x74,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x50,0x01,0xff] -# GFX11: v_min_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x74,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x5f,0x01,0x01] -# GFX11: v_min_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x74,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x74,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x74,0x01,0x60,0x01,0x13] -# GFX11: v_min_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x74,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x74,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_min_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x74,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_min_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x74,0x7f,0x6f,0xfd,0x30] -# GFX11: v_min_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x1b,0x00,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1b,0x00,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x40,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x40,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x41,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x41,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x01,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x01,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x0f,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x0f,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x11,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x11,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x1f,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x1f,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x21,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x21,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x2f,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x2f,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x1e,0x01,0x50,0x01,0xff +# GFX11: v_min_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x50,0x01,0xff] -# GFX11: v_min_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x0a,0x1e,0x01,0x5f,0x01,0x01 +# GFX11: v_min_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x5f,0x01,0x01] -# GFX11: v_min_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x1e,0x01,0x60,0x01,0x13 +# GFX11: v_min_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x1e,0x01,0x60,0x01,0x13] -# GFX11: v_min_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x1f,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x1f,0xff,0x6f,0xfd,0x30 +# GFX11: v_min_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x1f,0xff,0x6f,0xfd,0x30] -# GFX11: v_min_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff +# GFX11: v_min_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff +# GFX11: 
v_min_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] -# GFX11: v_min_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01 +# GFX11: v_min_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] -# GFX11: v_min_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13 +# GFX11: v_min_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13] -# GFX11: v_min_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30 +# GFX11: v_min_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30] -# GFX11: v_min_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] -# GFX11: v_min_u32_dpp 
v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff +# GFX11: v_min_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] -# GFX11: v_min_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01 +# GFX11: v_min_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] -# GFX11: v_min_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13 +# GFX11: v_min_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13] -# GFX11: v_min_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30 +# GFX11: v_min_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff 
+# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13 +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13] -# GFX11: v_mul_dx9_zero_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30 +# GFX11: v_mul_dx9_zero_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30] -# GFX11: v_mul_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff -# GFX11: v_mul_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff +# GFX11-REAL16: 
v_mul_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] -# GFX11: v_mul_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_mul_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_mul_f16_dpp v127, 
-|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] -# GFX11: v_mul_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff +# GFX11: v_mul_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13 +# GFX11: v_mul_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13] -# GFX11: v_mul_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30 +# GFX11: v_mul_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] -# GFX11: 
v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13 +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13] -# GFX11: v_mul_hi_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30 +# GFX11: v_mul_hi_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff +# GFX11: 
v_mul_hi_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] -# GFX11: 
v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13 +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13] -# GFX11: v_mul_hi_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30 +# GFX11: v_mul_hi_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13 +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13] -# GFX11: v_mul_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30 +# GFX11: v_mul_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01 +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13 +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13] -# GFX11: v_mul_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30 +# GFX11: v_mul_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30] -# GFX11: v_or_b32_dpp v5, v1, v2 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff +# GFX11: v_or_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] -# GFX11: v_or_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01 +# GFX11: v_or_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] -# GFX11: v_or_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13 +# GFX11: v_or_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13] -# GFX11: v_or_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30] 
0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30 +# GFX11: v_or_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, 
v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30 # W32: v_sub_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30] # W64: v_sub_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30 -# GFX11: v_sub_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] -# GFX11: v_sub_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] -# GFX11: v_sub_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_sub_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_sub_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] -# GFX11: v_sub_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff +# GFX11: 
v_sub_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff +# GFX11: v_sub_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01 +# GFX11: v_sub_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] -# GFX11: v_sub_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13 +# GFX11: v_sub_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13] -# GFX11: v_sub_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30 +# GFX11: v_sub_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01 +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13 +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13] -# GFX11: v_sub_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30 +# GFX11: v_sub_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff # W32: 
v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30 # W32: v_subrev_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30] # W64: v_subrev_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30 -# GFX11: v_subrev_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff +# GFX11-REAL16: v_subrev_f16_dpp 
v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: 
v_subrev_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] -# GFX11: v_subrev_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] -# GFX11: v_subrev_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30 +# GFX11-REAL16: v_subrev_f16_dpp v127.l, -|v127.l|, -|v127.l| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_subrev_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] -# GFX11: v_subrev_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff +# 
GFX11: v_subrev_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01 +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] -# GFX11: v_subrev_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13 +# GFX11: v_subrev_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13] -# GFX11: v_subrev_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30 +# GFX11: v_subrev_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01 +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13 +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13] -# GFX11: v_subrev_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30 +# GFX11: v_subrev_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30] -# GFX11: v_xnor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01 +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] -# GFX11: v_xnor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13 +# GFX11: v_xnor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13] -# GFX11: v_xnor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30 +# GFX11: v_xnor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30] -# GFX11: v_xor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] -# GFX11: 
v_xor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff +# GFX11: v_xor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01 +# GFX11: v_xor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] -# GFX11: v_xor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13 +# GFX11: v_xor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13] -# GFX11: v_xor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30 +# GFX11: v_xor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt index 5f1d4d4b33cbd..a1d2c34f09f2b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2_dpp8.txt @@ -1,250 +1,267 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32 %s -# RUN: llvm-mc -triple=amdgcn 
-mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W32,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,W64,GFX11-FAKE16 %s +0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00 # W32: v_add_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] # W64: v_add_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00 -# GFX11: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] -# GFX11: v_add_f16_dpp v127, v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_add_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_add_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] -# GFX11: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05 +# GFX11: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] -# GFX11: v_add_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00 +# GFX11: v_add_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] -# GFX11: v_add_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05 +# GFX11: v_add_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] -# GFX11: v_add_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00 +# GFX11: v_add_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] -# GFX11: v_and_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05 +# GFX11: v_and_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] -# GFX11: v_and_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00 +# GFX11: v_and_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] -# GFX11: v_ashrrev_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05 +# GFX11: v_ashrrev_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] -# GFX11: v_ashrrev_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00 +# GFX11: v_ashrrev_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] +0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00 # W32: v_cndmask_b32_dpp v255, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] # W64: v_cndmask_b32_dpp v255, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00 -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05 +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] -# GFX11: v_cvt_pk_rtz_f16_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00 +# GFX11: v_cvt_pk_rtz_f16_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] -# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] 
0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05 +# GFX11: v_dot2acc_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x04,0x01,0x77,0x39,0x05] -# GFX11: v_dot2acc_f32_f16_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x05,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x05,0xff,0x00,0x00,0x00 +# GFX11: v_dot2acc_f32_f16_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x05,0xff,0x00,0x00,0x00] -# GFX11: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05 +# GFX11: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] -# GFX11: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00 +# GFX11: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] -# GFX11: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05 +# GFX11: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] -# GFX11: v_fmac_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00 +# GFX11: v_fmac_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] -# GFX11: v_ldexp_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ldexp_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] -# GFX11: v_ldexp_f16_dpp 
v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_ldexp_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_ldexp_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] -# GFX11: v_lshlrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05 +# GFX11: v_lshlrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] -# GFX11: v_lshlrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00 +# GFX11: v_lshlrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] -# GFX11: v_lshrrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05 +# GFX11: v_lshrrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] -# GFX11: v_lshrrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00 +# GFX11: v_lshrrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] -# GFX11: v_max_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x72,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x72,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_max_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x72,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_max_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x72,0x01,0x77,0x39,0x05] -# GFX11: v_max_f16_dpp v127, v127, v127 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x72,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x72,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_max_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x72,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_max_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x72,0x7f,0x00,0x00,0x00] -# GFX11: v_max_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x20,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x20,0x01,0x77,0x39,0x05 +# GFX11: v_max_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x20,0x01,0x77,0x39,0x05] -# GFX11: v_max_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x21,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x21,0xff,0x00,0x00,0x00 +# GFX11: v_max_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x21,0xff,0x00,0x00,0x00] -# GFX11: v_max_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05 +# GFX11: v_max_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] -# GFX11: v_max_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00 +# GFX11: v_max_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] -# GFX11: v_max_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05 +# GFX11: v_max_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] -# GFX11: v_max_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00 +# GFX11: v_max_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] -# GFX11: v_min_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x74,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x74,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_min_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x74,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_min_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x74,0x01,0x77,0x39,0x05] -# GFX11: v_min_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x74,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x74,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_min_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x74,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_min_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x74,0x7f,0x00,0x00,0x00] -# GFX11: v_min_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x1e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x1e,0x01,0x77,0x39,0x05 +# GFX11: v_min_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x1e,0x01,0x77,0x39,0x05] -# GFX11: v_min_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x1f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x1f,0xff,0x00,0x00,0x00 +# GFX11: v_min_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x1f,0xff,0x00,0x00,0x00] -# GFX11: v_min_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05 +# GFX11: v_min_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] -# GFX11: v_min_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00 +# GFX11: v_min_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] -# GFX11: v_min_u32_dpp v5, v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05 +# GFX11: v_min_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] -# GFX11: v_min_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00 +# GFX11: v_min_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] -# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05 +# GFX11: v_mul_dx9_zero_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] -# GFX11: v_mul_dx9_zero_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00 +# GFX11: v_mul_dx9_zero_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] -# GFX11: v_mul_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_mul_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] -# GFX11: v_mul_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_mul_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_mul_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] -# GFX11: v_mul_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05 +# GFX11: v_mul_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] -# GFX11: v_mul_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00 +# GFX11: v_mul_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] -# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05 +# GFX11: v_mul_hi_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] -# GFX11: v_mul_hi_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00 +# GFX11: v_mul_hi_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] -# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05 +# GFX11: v_mul_hi_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] -# GFX11: v_mul_hi_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00 +# GFX11: v_mul_hi_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] -# GFX11: v_mul_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05 +# GFX11: v_mul_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] -# GFX11: v_mul_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00 +# GFX11: v_mul_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] -# GFX11: v_mul_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05 +# GFX11: v_mul_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] -# GFX11: v_mul_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00 +# GFX11: v_mul_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] -# GFX11: v_or_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05 +# GFX11: v_or_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] -# GFX11: v_or_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00 +# GFX11: v_or_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] +0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00 # W32: v_sub_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] # W64: v_sub_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] 
-0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00 -# GFX11: v_sub_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_sub_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] -# GFX11: v_sub_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_sub_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_sub_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] -# GFX11: v_sub_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05 +# GFX11: v_sub_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] -# GFX11: v_sub_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00 +# GFX11: v_sub_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] -# GFX11: v_sub_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05 +# GFX11: v_sub_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] -# GFX11: v_sub_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00 +# GFX11: v_sub_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] 
+0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00 # W32: v_subrev_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] # W64: v_subrev_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00 -# GFX11: v_subrev_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_subrev_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] -# GFX11: v_subrev_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_subrev_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_subrev_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] -# GFX11: v_subrev_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05 +# GFX11: v_subrev_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] -# GFX11: v_subrev_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00 +# GFX11: 
v_subrev_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] -# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05 +# GFX11: v_subrev_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] -# GFX11: v_subrev_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00 +# GFX11: v_subrev_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] -# GFX11: v_xnor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05 +# GFX11: v_xnor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] -# GFX11: v_xnor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00 +# GFX11: v_xnor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] -# GFX11: v_xor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05 +# GFX11: v_xor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] -# GFX11: v_xor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00 +# GFX11: v_xor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt index 673db0664fc6a..1276d898160b3 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2.txt @@ -1,2228 +1,2336 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-FAKE16 %s +0x01,0x05,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x40] -0x01,0x05,0x0a,0x40 +0xff,0x05,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x40] -0xff,0x05,0x0a,0x40 +0x01,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x40] -0x01,0x04,0x0a,0x40 +0x69,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x40] -0x69,0x04,0x0a,0x40 
+0x6a,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x40] -0x6a,0x04,0x0a,0x40 +0x6b,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x40] -0x6b,0x04,0x0a,0x40 +0x7b,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x40] -0x7b,0x04,0x0a,0x40 +0x7d,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x40] -0x7d,0x04,0x0a,0x40 +0x7e,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x40] -0x7e,0x04,0x0a,0x40 +0x7f,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x40] -0x7f,0x04,0x0a,0x40 +0x7c,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x40] -0x7c,0x04,0x0a,0x40 +0xc1,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x40] -0xc1,0x04,0x0a,0x40 +0xf0,0x04,0x0a,0x40 # W32: v_add_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x40] -0xf0,0x04,0x0a,0x40 +0xfd,0x04,0x0a,0x40 # W32: 
v_add_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x40] # W64: v_add_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x40] -0xfd,0x04,0x0a,0x40 +0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf # W32: v_add_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] # W64: v_add_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x41,0x56,0x34,0x12,0xaf -# GFX12: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] 0x01,0x05,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x64] 0x7f,0x05,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x64] 0x01,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x64] 0x69,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x64] 0x6a,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x64] 0x6b,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, vcc_hi, v2 ; encoding: 
[0x6b,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x64] 0x7b,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x64] 0x7d,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x64] 0x7e,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x64] 0x7f,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x64] 0x7c,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x64] 0xc1,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x64] 0xf0,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x64] 0xfd,0x04,0x0a,0x64 +# GFX12-REAL16: v_add_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x64] +# GFX12-FAKE16: v_add_f16_e32 
v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x64] -# GFX12: v_add_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_add_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_add_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x64,0x0b,0xfe,0x00,0x00] -# GFX12: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06] 0x01,0x05,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x06] 0xff,0x05,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x06] 0x01,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x06] 0x69,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x06] 0x6a,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x06] 0x6b,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x06] 0x7b,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x06] 0x7d,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x06] 0x7e,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x06] 0x7f,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, exec_hi, v2 ; 
encoding: [0x7f,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x06] 0x7c,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x06] 0xc1,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x06] 0xf0,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x06] 0xfd,0x04,0x0a,0x06 +# GFX12: v_add_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x06] -# GFX12: v_add_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf +# GFX12: v_add_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x07,0x56,0x34,0x12,0xaf] -# GFX12: v_add_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x04] 0x01,0x07,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x04] 0xfe,0x05,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x04] 0x00,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x04] 0x68,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x04] 0x6a,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x04] 0x7a,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], ttmp[14:15], v[2:3] 
; encoding: [0x7a,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x04] 0x7e,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x04] 0x7c,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x04] 0xc1,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x04] 0xf0,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x04] 0xfd,0x04,0x0a,0x04 +# GFX12: v_add_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x04] -# GFX12: v_add_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x05,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xfd,0x05,0x56,0x34,0x12,0xaf +# GFX12: v_add_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x05,0x56,0x34,0x12,0xaf] -# GFX12: v_add_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4a] 0x01,0x05,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4a] 0xff,0x05,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4a] 0x01,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4a] 0x69,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4a] 0x6a,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, vcc_lo, v2 ; 
encoding: [0x6a,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4a] 0x6b,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4a] 0x7b,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4a] 0x7d,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4a] 0x7e,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4a] 0x7f,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4a] 0x7c,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4a] 0xc1,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4a] 0xf0,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4a] 0xfd,0x04,0x0a,0x4a +# GFX12: v_add_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4a] -# GFX12: v_add_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf +# GFX12: v_add_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4b,0x56,0x34,0x12,0xaf] -# GFX12: v_and_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x36] 0x01,0x05,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, v255, 
v2 ; encoding: [0xff,0x05,0x0a,0x36] 0xff,0x05,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x36] 0x01,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x36] 0x69,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x36] 0x6a,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x36] 0x6b,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x36] 0x7b,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x36] 0x7d,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x36] 0x7e,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x36] 0x7f,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x36] 0x7c,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x36] 0xc1,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x36] 0xf0,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x36] 
0xfd,0x04,0x0a,0x36 +# GFX12: v_and_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x36] -# GFX12: v_and_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf +# GFX12: v_and_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x37,0x56,0x34,0x12,0xaf] -# GFX12: v_ashrrev_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x34] 0x01,0x05,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x34] 0xff,0x05,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x34] 0x01,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x34] 0x69,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x34] 0x6a,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x34] 0x6b,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x34] 0x7b,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x34] 0x7d,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x34] 0x7e,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x34] 0x7f,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 
v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x34] 0x7c,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x34] 0xc1,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x34] 0xf0,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x34] 0xfd,0x04,0x0a,0x34 +# GFX12: v_ashrrev_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x34] -# GFX12: v_ashrrev_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf +# GFX12: v_ashrrev_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x35,0x56,0x34,0x12,0xaf] -# W32: v_cndmask_b32_e32 v5, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x02] 0x01,0x05,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x02] 0xff,0x05,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x02] 0x01,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x02] -# W32: 
v_cndmask_b32_e32 v5, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x02] 0x69,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x02] 0x6a,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x02] 0x6b,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x02] 0x7b,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x02] 0x7d,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x02] 0x7e,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, 
exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x02] 0x7f,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x02] 0x7c,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x02] 0xc1,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x02] 0xf0,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x02] -# W32: v_cndmask_b32_e32 v5, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x02] -# W64: v_cndmask_b32_e32 v5, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x02] 0xfd,0x04,0x0a,0x02 +# W32: v_cndmask_b32_e32 v5, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x02] +# W64: v_cndmask_b32_e32 v5, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x02] +0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf # W32: v_cndmask_b32_e32 v255, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] # W64: v_cndmask_b32_e32 v255, 0xaf123456, v255, vcc ; encoding: 
[0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x03,0x56,0x34,0x12,0xaf -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x5e] 0x01,0x05,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x5e] 0xff,0x05,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x5e] 0x01,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x5e] 0x69,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x5e] 0x6a,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x5e] 0x6b,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x5e] 0x7b,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x5e] 0x7d,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x5e] 0x7e,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x5e] 0x7f,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, null, v2 ; encoding: 
[0x7c,0x04,0x0a,0x5e] 0x7c,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x5e] 0xc1,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x5e] 0xf0,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x5e] 0xfd,0x04,0x0a,0x5e +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x5e] -# GFX12: v_cvt_pk_rtz_f16_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_rtz_f16_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x5f,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f16 v5, v1, v2, 0xfe0b ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, v1, v2, 0xfe0b ; encoding: [0x01,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, v127, v2, 0xfe0b ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, v127, v2, 0xfe0b ; encoding: [0x7f,0x05,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, s1, v2, 0xfe0b ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, s1, v2, 0xfe0b ; encoding: [0x01,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, s105, v2, 0xfe0b ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, s105, v2, 0xfe0b ; encoding: [0x69,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 
0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, vcc_lo, v2, 0xfe0b ; encoding: [0x6a,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, vcc_hi, v2, 0xfe0b ; encoding: [0x6b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, ttmp15, v2, 0xfe0b ; encoding: [0x7b,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, m0, v2, 0xfe0b ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, m0, v2, 0xfe0b ; encoding: [0x7d,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, exec_lo, v2, 0xfe0b ; encoding: [0x7e,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, exec_hi, v2, 0xfe0b ; encoding: [0x7f,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, null, v2, 0xfe0b ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, null, v2, 0xfe0b ; encoding: [0x7c,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, -1, v2, 0xfe0b ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, -1, v2, 0xfe0b ; encoding: [0xc1,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, 0.5, v2, 0xfe0b ; encoding: [0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, 0.5, v2, 0xfe0b ; encoding: 
[0xf0,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v5, src_scc, v2, 0xfe0b ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] 0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v5, src_scc, v2, 0xfe0b ; encoding: [0xfd,0x04,0x0a,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmaak_f16 v127, 0xfe0b, v127, 0xfe0b ; encoding: [0xff,0xfe,0xfe,0x70,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmaak_f32 v5, v1, v2, 0xaf123456 ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, v1, v2, 0xaf123456 ; encoding: [0x01,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, v255, v2, 0xaf123456 ; encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, v255, v2, 0xaf123456 ; encoding: [0xff,0x05,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, s1, v2, 0xaf123456 ; encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, s1, v2, 0xaf123456 ; encoding: [0x01,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, s105, v2, 0xaf123456 ; encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, s105, v2, 0xaf123456 ; encoding: [0x69,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 ; encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, vcc_lo, v2, 0xaf123456 ; encoding: [0x6a,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 ; encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, vcc_hi, v2, 0xaf123456 ; encoding: [0x6b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: 
v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 ; encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, ttmp15, v2, 0xaf123456 ; encoding: [0x7b,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, m0, v2, 0xaf123456 ; encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, m0, v2, 0xaf123456 ; encoding: [0x7d,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 ; encoding: [0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, exec_lo, v2, 0xaf123456 ; encoding: [0x7e,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 ; encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, exec_hi, v2, 0xaf123456 ; encoding: [0x7f,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, null, v2, 0xaf123456 ; encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, null, v2, 0xaf123456 ; encoding: [0x7c,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, -1, v2, 0xaf123456 ; encoding: [0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, -1, v2, 0xaf123456 ; encoding: [0xc1,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, 0.5, v2, 0xaf123456 ; encoding: [0xf0,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v5, src_scc, v2, 0xaf123456 ; encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] 0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v5, src_scc, v2, 0xaf123456 ; encoding: [0xfd,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf] -# GFX12: v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 ; 
encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf +# GFX12: v_fmaak_f32 v255, 0xaf123456, v255, 0xaf123456 ; encoding: [0xff,0xfe,0xff,0x5b,0x56,0x34,0x12,0xaf] -# GFX12: v_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6c] 0x01,0x05,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6c] 0x7f,0x05,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6c] 0x01,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6c] 0x69,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6c] 0x6a,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6c] 0x6b,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6c] 0x7b,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6c] 0x7d,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6c] 0x7e,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x6c] 0x7f,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6c] 0x7c,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, null, v2 ; encoding: 
[0x7c,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6c] 0xc1,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6c] 0xf0,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6c] 0xfd,0x04,0x0a,0x6c +# GFX12: v_fmac_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6c] -# GFX12: v_fmac_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmac_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6c,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmac_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x56] 0x01,0x05,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x56] 0xff,0x05,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x56] 0x01,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x56] 0x69,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x56] 0x6a,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x56] 0x6b,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x56] 0x7b,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x56] 0x7d,0x04,0x0a,0x56 +# 
GFX12: v_fmac_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x56] 0x7e,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x56] 0x7f,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x56] 0x7c,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x56] 0xc1,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x56] 0xf0,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x56] 0xfd,0x04,0x0a,0x56 +# GFX12: v_fmac_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x56] -# GFX12: v_fmac_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf +# GFX12: v_fmac_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x57,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f16 v5, v1, 0xfe0b, v3 ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, v1, 0xfe0b, v3 ; encoding: [0x01,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, v127, 0xfe0b, v3 ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, v127, 0xfe0b, v3 ; encoding: [0x7f,0x07,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, s1, 0xfe0b, v3 ; encoding: [0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, s1, 0xfe0b, v3 ; encoding: 
[0x01,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, s105, 0xfe0b, v3 ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, s105, 0xfe0b, v3 ; encoding: [0x69,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, vcc_lo, 0xfe0b, v3 ; encoding: [0x6a,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, vcc_hi, 0xfe0b, v3 ; encoding: [0x6b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, ttmp15, 0xfe0b, v3 ; encoding: [0x7b,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, m0, 0xfe0b, v3 ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, m0, 0xfe0b, v3 ; encoding: [0x7d,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, exec_lo, 0xfe0b, v3 ; encoding: [0x7e,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, exec_hi, 0xfe0b, v3 ; encoding: [0x7f,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, null, 0xfe0b, v3 ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, null, 0xfe0b, v3 ; encoding: [0x7c,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, -1, 0xfe0b, v3 ; encoding: 
[0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, -1, 0xfe0b, v3 ; encoding: [0xc1,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, 0.5, 0xfe0b, v3 ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, 0.5, 0xfe0b, v3 ; encoding: [0xf0,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v5, src_scc, 0xfe0b, v3 ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] 0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v5, src_scc, 0xfe0b, v3 ; encoding: [0xfd,0x06,0x0a,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00 +# GFX12: v_fmamk_f16 v127, 0xfe0b, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6e,0x0b,0xfe,0x00,0x00] -# GFX12: v_fmamk_f32 v5, v1, 0xaf123456, v3 ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, v1, 0xaf123456, v3 ; encoding: [0x01,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, v255, 0xaf123456, v3 ; encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, v255, 0xaf123456, v3 ; encoding: [0xff,0x07,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, s1, 0xaf123456, v3 ; encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, s1, 0xaf123456, v3 ; encoding: [0x01,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, s105, 0xaf123456, v3 ; encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, s105, 0xaf123456, v3 ; encoding: [0x69,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 ; encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf 
+# GFX12: v_fmamk_f32 v5, vcc_lo, 0xaf123456, v3 ; encoding: [0x6a,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 ; encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, vcc_hi, 0xaf123456, v3 ; encoding: [0x6b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 ; encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, ttmp15, 0xaf123456, v3 ; encoding: [0x7b,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, m0, 0xaf123456, v3 ; encoding: [0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, m0, 0xaf123456, v3 ; encoding: [0x7d,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 ; encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, exec_lo, 0xaf123456, v3 ; encoding: [0x7e,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 ; encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, exec_hi, 0xaf123456, v3 ; encoding: [0x7f,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, null, 0xaf123456, v3 ; encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, null, 0xaf123456, v3 ; encoding: [0x7c,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, -1, 0xaf123456, v3 ; encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, -1, 0xaf123456, v3 ; encoding: [0xc1,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, 0.5, 0xaf123456, v3 ; encoding: [0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, 0.5, 0xaf123456, v3 ; 
encoding: [0xf0,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v5, src_scc, 0xaf123456, v3 ; encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] 0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v5, src_scc, 0xaf123456, v3 ; encoding: [0xfd,0x06,0x0a,0x58,0x56,0x34,0x12,0xaf] -# GFX12: v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf +# GFX12: v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf] -# GFX12: v_ldexp_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x76] 0x01,0x05,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x76] 0x7f,0x05,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x76] 0x01,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x76] 0x69,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x76] 0x6a,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x76] 0x6b,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, vcc_hi, v2.l ; 
encoding: [0x6b,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x76] 0x7b,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x76] 0x7d,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x76] 0x7e,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x76] 0x7f,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x76] 0x7c,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x76] 0xc1,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x76] 0xf0,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x76] 
0xfd,0x04,0x0a,0x76 +# GFX12-REAL16: v_ldexp_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x76] +# GFX12-FAKE16: v_ldexp_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x76] -# GFX12: v_ldexp_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_ldexp_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_ldexp_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00] -# GFX12: v_lshlrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x30] 0x01,0x05,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x30] 0xff,0x05,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x30] 0x01,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x30] 0x69,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x30] 0x6a,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x30] 0x6b,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x30] 0x7b,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x30] 0x7d,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x30] 
0x7e,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x30] 0x7f,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x30] 0x7c,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x30] 0xc1,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x30] 0xf0,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x30] 0xfd,0x04,0x0a,0x30 +# GFX12: v_lshlrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x30] -# GFX12: v_lshlrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf +# GFX12: v_lshlrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x31,0x56,0x34,0x12,0xaf] -# GFX12: v_lshlrev_b64_e32 v[5:6], v1, v[3:4] ; encoding: [0x01,0x07,0x0a,0x3e] 0x01,0x07,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], v1, v[3:4] ; encoding: [0x01,0x07,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], v255, v[2:3] ; encoding: [0xff,0x05,0x0a,0x3e] 0xff,0x05,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], v255, v[2:3] ; encoding: [0xff,0x05,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], s1, v[2:3] ; encoding: [0x01,0x04,0x0a,0x3e] 0x01,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], s1, v[2:3] ; encoding: [0x01,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], s105, v[2:3] ; encoding: [0x69,0x04,0x0a,0x3e] 0x69,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], s105, v[2:3] ; encoding: [0x69,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], vcc_lo, v[2:3] ; 
encoding: [0x6a,0x04,0x0a,0x3e] 0x6a,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], vcc_lo, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], vcc_hi, v[2:3] ; encoding: [0x6b,0x04,0x0a,0x3e] 0x6b,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], vcc_hi, v[2:3] ; encoding: [0x6b,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], ttmp15, v[2:3] ; encoding: [0x7b,0x04,0x0a,0x3e] 0x7b,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], ttmp15, v[2:3] ; encoding: [0x7b,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], exec_lo, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x3e] 0x7e,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], exec_lo, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], exec_hi, v[2:3] ; encoding: [0x7f,0x04,0x0a,0x3e] 0x7f,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], exec_hi, v[2:3] ; encoding: [0x7f,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x3e] 0x7c,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x3e] 0xc1,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x3e] 0xf0,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x3e] 0xfd,0x04,0x0a,0x3e +# GFX12: v_lshlrev_b64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x3e] -# GFX12: v_lshlrev_b64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x3f,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xfd,0x3f,0x56,0x34,0x12,0xaf +# GFX12: v_lshlrev_b64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x3f,0x56,0x34,0x12,0xaf] -# GFX12: v_lshrrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x32] 
0x01,0x05,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x32] 0xff,0x05,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x32] 0x01,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x32] 0x69,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x32] 0x6a,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x32] 0x6b,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x32] 0x7b,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x32] 0x7d,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x32] 0x7e,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x32] 0x7f,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x32] 0x7c,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x32] 0xc1,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x32] -# GFX12: 
v_lshrrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x32] 0xf0,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x32] 0xfd,0x04,0x0a,0x32 +# GFX12: v_lshrrev_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x32] -# GFX12: v_lshrrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf +# GFX12: v_lshrrev_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x33,0x56,0x34,0x12,0xaf] -# GFX12: v_max_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x62] 0x01,0x05,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x62] 0x7f,0x05,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x62] 0x01,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x62] 0x69,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x62] 0x6a,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x62] 0x6b,0x04,0x0a,0x62 +# 
GFX12-REAL16: v_max_num_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x62] 0x7b,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x62] 0x7d,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x62] 0x7e,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x62] 0x7f,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x62] 0x7c,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x62] 0xc1,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x62] 0xf0,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, 0.5, v2 ; encoding: 
[0xf0,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x62] 0xfd,0x04,0x0a,0x62 +# GFX12-REAL16: v_max_num_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x62] +# GFX12-FAKE16: v_max_num_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x62] -# GFX12: v_max_num_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x62,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x62,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_max_num_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x62,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_max_num_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x62,0x0b,0xfe,0x00,0x00] -# GFX12: v_max_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2c] 0x01,0x05,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x2c] 0xff,0x05,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x2c] 0x01,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x2c] 0x69,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x2c] 0x6a,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x2c] 0x6b,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x2c] 0x7b,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x2c] 0x7d,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, m0, v2 ; encoding: 
[0x7d,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x2c] 0x7e,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x2c] 0x7f,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x2c] 0x7c,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x2c] 0xc1,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x2c] 0xf0,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x2c] 0xfd,0x04,0x0a,0x2c +# GFX12: v_max_num_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x2c] -# GFX12: v_max_num_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x2d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x2d,0x56,0x34,0x12,0xaf +# GFX12: v_max_num_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x2d,0x56,0x34,0x12,0xaf] -# GFX12: v_max_num_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x1c] 0x01,0x07,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x1c] 0xfe,0x05,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x1c] 0x00,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x1c] 0x68,0x04,0x0a,0x1c +# 
GFX12: v_max_num_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x1c] 0x6a,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x1c] 0x7a,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x1c] 0x7e,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x1c] 0x7c,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x1c] 0xc1,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x1c] 0xf0,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x1c] 0xfd,0x04,0x0a,0x1c +# GFX12: v_max_num_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x1c] -# GFX12: v_max_num_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x1d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xfd,0x1d,0x56,0x34,0x12,0xaf +# GFX12: v_max_num_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x1d,0x56,0x34,0x12,0xaf] -# GFX12: v_max_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x24] 0x01,0x05,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x24] 0xff,0x05,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x24] -# 
GFX12: v_max_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x24] 0x01,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x24] 0x69,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x24] 0x6a,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x24] 0x6b,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x24] 0x7b,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x24] 0x7d,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x24] 0x7e,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x24] 0x7f,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x24] 0x7c,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x24] 0xc1,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x24] 0xf0,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x24] 0xfd,0x04,0x0a,0x24 +# GFX12: v_max_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x24] -# GFX12: v_max_i32_e32 v255, 0xaf123456, 
v255 ; encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf +# GFX12: v_max_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x25,0x56,0x34,0x12,0xaf] -# GFX12: v_max_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x28] 0x01,0x05,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x28] 0xff,0x05,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x28] 0x01,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x28] 0x69,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x28] 0x6a,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x28] 0x6b,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x28] 0x7b,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x28] 0x7d,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x28] 0x7e,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x28] 0x7f,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x28] 0x7c,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x28] -# 
GFX12: v_max_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x28] 0xc1,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x28] 0xf0,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x28] 0xfd,0x04,0x0a,0x28 +# GFX12: v_max_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x28] -# GFX12: v_max_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf +# GFX12: v_max_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x29,0x56,0x34,0x12,0xaf] -# GFX12: v_min_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x60] 0x01,0x05,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x60] 0x7f,0x05,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x60] 0x01,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x60] 0x69,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x60] 0x6a,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, 
vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x60] 0x6b,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x60] 0x7b,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x60] 0x7d,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x60] 0x7e,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x60] 0x7f,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x60] 0x7c,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x60] 0xc1,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x60] 0xf0,0x04,0x0a,0x60 
+# GFX12-REAL16: v_min_num_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x60] 0xfd,0x04,0x0a,0x60 +# GFX12-REAL16: v_min_num_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x60] +# GFX12-FAKE16: v_min_num_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x60] -# GFX12: v_min_num_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x60,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x60,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_min_num_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x60,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_min_num_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x60,0x0b,0xfe,0x00,0x00] -# GFX12: v_min_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2a] 0x01,0x05,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x2a] 0xff,0x05,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x2a] 0x01,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x2a] 0x69,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x2a] 0x6a,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x2a] 0x6b,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x2a] 0x7b,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x2a] -# 
GFX12: v_min_num_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x2a] 0x7d,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x2a] 0x7e,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x2a] 0x7f,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x2a] 0x7c,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x2a] 0xc1,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x2a] 0xf0,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x2a] 0xfd,0x04,0x0a,0x2a +# GFX12: v_min_num_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x2a] -# GFX12: v_min_num_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x2b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x2b,0x56,0x34,0x12,0xaf +# GFX12: v_min_num_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x2b,0x56,0x34,0x12,0xaf] -# GFX12: v_min_num_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x1a] 0x01,0x07,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x1a] 0xfe,0x05,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x1a] 0x00,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], s[0:1], v[2:3] ; 
encoding: [0x00,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x1a] 0x68,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x1a] 0x6a,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x1a] 0x7a,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x1a] 0x7e,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x1a] 0x7c,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x1a] 0xc1,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x1a] 0xf0,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x1a] 0xfd,0x04,0x0a,0x1a +# GFX12: v_min_num_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x1a] -# GFX12: v_min_num_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x1b,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xfd,0x1b,0x56,0x34,0x12,0xaf +# GFX12: v_min_num_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x1b,0x56,0x34,0x12,0xaf] -# GFX12: v_min_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x22] 0x01,0x05,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x22] -# GFX12: v_min_i32_e32 
v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x22] 0xff,0x05,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x22] 0x01,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x22] 0x69,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x22] 0x6a,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x22] 0x6b,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x22] 0x7b,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x22] 0x7d,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x22] 0x7e,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x22] 0x7f,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x22] 0x7c,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x22] 0xc1,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x22] 0xf0,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v5, src_scc, v2 ; encoding: 
[0xfd,0x04,0x0a,0x22] 0xfd,0x04,0x0a,0x22 +# GFX12: v_min_i32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x22] -# GFX12: v_min_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf +# GFX12: v_min_i32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x23,0x56,0x34,0x12,0xaf] -# GFX12: v_min_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x26] 0x01,0x05,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x26] 0xff,0x05,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x26] 0x01,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x26] 0x69,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x26] 0x6a,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x26] 0x6b,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x26] 0x7b,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x26] 0x7d,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x26] 0x7e,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x26] 0x7f,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x26] -# GFX12: 
v_min_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x26] 0x7c,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x26] 0xc1,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x26] 0xf0,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x26] 0xfd,0x04,0x0a,0x26 +# GFX12: v_min_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x26] -# GFX12: v_min_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf +# GFX12: v_min_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x27,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0e] 0x01,0x05,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0e] 0xff,0x05,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0e] 0x01,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0e] 0x69,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0e] 0x6a,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0e] 0x6b,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, 
ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0e] 0x7b,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0e] 0x7d,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0e] 0x7e,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0e] 0x7f,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0e] 0x7c,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0e] 0xc1,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0e] 0xf0,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0e] 0xfd,0x04,0x0a,0x0e +# GFX12: v_mul_dx9_zero_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0e] -# GFX12: v_mul_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf +# GFX12: v_mul_dx9_zero_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0f,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6a] 0x01,0x05,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6a] 
0x7f,0x05,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6a] 0x01,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6a] 0x69,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6a] 0x6a,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6a] 0x6b,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6a] 0x7b,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6a] 0x7d,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6a] 0x7e,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, exec_hi, v2 ; 
encoding: [0x7f,0x04,0x0a,0x6a] 0x7f,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6a] 0x7c,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6a] 0xc1,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6a] 0xf0,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6a] 0xfd,0x04,0x0a,0x6a +# GFX12-REAL16: v_mul_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x6a] +# GFX12-FAKE16: v_mul_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x6a] -# GFX12: v_mul_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_mul_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_mul_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x6a,0x0b,0xfe,0x00,0x00] -# GFX12: v_mul_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x10] 0x01,0x05,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x10] 0xff,0x05,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x10] 0x01,0x04,0x0a,0x10 +# GFX12: 
v_mul_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x10] 0x69,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x10] 0x6a,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x10] 0x6b,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x10] 0x7b,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x10] 0x7d,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x10] 0x7e,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x10] 0x7f,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x10] 0x7c,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x10] 0xc1,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x10] 0xf0,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x10] 0xfd,0x04,0x0a,0x10 +# GFX12: v_mul_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x10] -# GFX12: v_mul_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] 
0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf +# GFX12: v_mul_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x11,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x0c] 0x01,0x07,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], v[1:2], v[3:4] ; encoding: [0x01,0x07,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x0c] 0xfe,0x05,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], v[254:255], v[2:3] ; encoding: [0xfe,0x05,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x0c] 0x00,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], s[0:1], v[2:3] ; encoding: [0x00,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x0c] 0x68,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], s[104:105], v[2:3] ; encoding: [0x68,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x0c] 0x6a,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], vcc, v[2:3] ; encoding: [0x6a,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x0c] 0x7a,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], ttmp[14:15], v[2:3] ; encoding: [0x7a,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x0c] 0x7e,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], exec, v[2:3] ; encoding: [0x7e,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x0c] 0x7c,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], null, v[2:3] ; encoding: [0x7c,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x0c] 0xc1,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], -1, v[2:3] ; encoding: [0xc1,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x0c] 0xf0,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], 0.5, v[2:3] ; encoding: [0xf0,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[5:6], src_scc, 
v[2:3] ; encoding: [0xfd,0x04,0x0a,0x0c] 0xfd,0x04,0x0a,0x0c +# GFX12: v_mul_f64_e32 v[5:6], src_scc, v[2:3] ; encoding: [0xfd,0x04,0x0a,0x0c] -# GFX12: v_mul_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x0d,0x56,0x34,0x12,0xaf] 0xff,0xfc,0xfd,0x0d,0x56,0x34,0x12,0xaf +# GFX12: v_mul_f64_e32 v[254:255], 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xfd,0x0d,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_hi_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x14] 0x01,0x05,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x14] 0xff,0x05,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x14] 0x01,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x14] 0x69,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x14] 0x6a,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x14] 0x6b,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x14] 0x7b,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x14] 0x7d,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x14] 0x7e,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x14] 
-# GFX12: v_mul_hi_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x14] 0x7f,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x14] 0x7c,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x14] 0xc1,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x14] 0xf0,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x14] 0xfd,0x04,0x0a,0x14 +# GFX12: v_mul_hi_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x14] -# GFX12: v_mul_hi_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf +# GFX12: v_mul_hi_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x15,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_hi_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x18] 0x01,0x05,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x18] 0xff,0x05,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x18] 0x01,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x18] 0x69,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x18] 0x6a,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, vcc_lo, v2 ; encoding: 
[0x6a,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x18] 0x6b,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x18] 0x7b,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x18] 0x7d,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x18] 0x7e,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x18] 0x7f,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x18] 0x7c,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x18] 0xc1,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x18] 0xf0,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x18] 0xfd,0x04,0x0a,0x18 +# GFX12: v_mul_hi_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x18] -# GFX12: v_mul_hi_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf +# GFX12: v_mul_hi_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x19,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_i32_i24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x12] 0x01,0x05,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 
v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x12] 0xff,0x05,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x12] 0x01,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x12] 0x69,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x12] 0x6a,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x12] 0x6b,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x12] 0x7b,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x12] 0x7d,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x12] 0x7e,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x12] 0x7f,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x12] 0x7c,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x12] 0xc1,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, 0.5, v2 ; encoding: 
[0xf0,0x04,0x0a,0x12] 0xf0,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x12] 0xfd,0x04,0x0a,0x12 +# GFX12: v_mul_i32_i24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x12] -# GFX12: v_mul_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf +# GFX12: v_mul_i32_i24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x13,0x56,0x34,0x12,0xaf] -# GFX12: v_mul_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x16] 0x01,0x05,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x16] 0xff,0x05,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x16] 0x01,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x16] 0x69,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x16] 0x6a,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x16] 0x6b,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x16] 0x7b,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x16] 0x7d,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x16] 0x7e,0x04,0x0a,0x16 
+# GFX12: v_mul_u32_u24_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x16] 0x7f,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x16] 0x7c,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x16] 0xc1,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x16] 0xf0,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x16] 0xfd,0x04,0x0a,0x16 +# GFX12: v_mul_u32_u24_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x16] -# GFX12: v_mul_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf +# GFX12: v_mul_u32_u24_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x17,0x56,0x34,0x12,0xaf] -# GFX12: v_or_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x38] 0x01,0x05,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x38] 0xff,0x05,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x38] 0x01,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x38] 0x69,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] 0x6a,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x38] -# GFX12: 
v_or_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x38] 0x6b,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x38] 0x7b,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x38] 0x7d,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x38] 0x7e,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x38] 0x7f,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x38] 0x7c,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x38] 0xc1,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x38] 0xf0,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x38] 0xfd,0x04,0x0a,0x38 +# GFX12: v_or_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x38] -# GFX12: v_or_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf +# GFX12: v_or_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x39,0x56,0x34,0x12,0xaf] -# GFX12: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] 0x01,0x05,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x78] 0xff,0x05,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, v255, v2 ; encoding: 
[0xff,0x05,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] 0x01,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x78] 0x69,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] 0x6a,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] 0x6b,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x78] 0x7b,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x78] 0x7d,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] 0x7e,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] 0x7f,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x78] 0x7c,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] 0xc1,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] 0xf0,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x78] -# GFX12: v_pk_fmac_f16 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x78] 0xfd,0x04,0x0a,0x78 +# GFX12: v_pk_fmac_f16 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x78] -# GFX12: 
v_pk_fmac_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00 +# GFX12: v_pk_fmac_f16 v255, 0xfe0b, v255 ; encoding: [0xff,0xfe,0xff,0x79,0x0b,0xfe,0x00,0x00] +0x01,0x05,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x42] -0x01,0x05,0x0a,0x42 +0xff,0x05,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x42] -0xff,0x05,0x0a,0x42 +0x01,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x42] -0x01,0x04,0x0a,0x42 +0x69,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x42] -0x69,0x04,0x0a,0x42 +0x6a,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x42] -0x6a,0x04,0x0a,0x42 +0x6b,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x42] -0x6b,0x04,0x0a,0x42 +0x7b,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x42] -0x7b,0x04,0x0a,0x42 +0x7d,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x42] -0x7d,0x04,0x0a,0x42 +0x7e,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 
v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x42] -0x7e,0x04,0x0a,0x42 +0x7f,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x42] -0x7f,0x04,0x0a,0x42 +0x7c,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x42] -0x7c,0x04,0x0a,0x42 +0xc1,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x42] -0xc1,0x04,0x0a,0x42 +0xf0,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x42] -0xf0,0x04,0x0a,0x42 +0xfd,0x04,0x0a,0x42 # W32: v_sub_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x42] # W64: v_sub_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x42] -0xfd,0x04,0x0a,0x42 +0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] # W64: v_sub_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: [0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x43,0x56,0x34,0x12,0xaf -# GFX12: v_sub_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x66] 0x01,0x05,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x66] 0x7f,0x05,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x66] 
+# GFX12-FAKE16: v_sub_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x66] 0x01,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x66] 0x69,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x66] 0x6a,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x66] 0x6b,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x66] 0x7b,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x66] 0x7d,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x66] 0x7e,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x66] 0x7f,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, exec_hi, v2.l ; 
encoding: [0x7f,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x66] 0x7c,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x66] 0xc1,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x66] 0xf0,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x66] 0xfd,0x04,0x0a,0x66 +# GFX12-REAL16: v_sub_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x66] +# GFX12-FAKE16: v_sub_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x66] -# GFX12: v_sub_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_sub_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_sub_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x66,0x0b,0xfe,0x00,0x00] -# GFX12: v_sub_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x08] 0x01,0x05,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x08] 0xff,0x05,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x08] 0x01,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, s105, v2 ; encoding: 
[0x69,0x04,0x0a,0x08] 0x69,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x08] 0x6a,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x08] 0x6b,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x08] 0x7b,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x08] 0x7d,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x08] 0x7e,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x08] 0x7f,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x08] 0x7c,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x08] 0xc1,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x08] 0xf0,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x08] 0xfd,0x04,0x0a,0x08 +# GFX12: v_sub_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x08] -# GFX12: v_sub_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf +# GFX12: v_sub_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x09,0x56,0x34,0x12,0xaf] -# 
GFX12: v_sub_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4c] 0x01,0x05,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4c] 0xff,0x05,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4c] 0x01,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4c] 0x69,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4c] 0x6a,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4c] 0x6b,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4c] 0x7b,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4c] 0x7d,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4c] 0x7e,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4c] 0x7f,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4c] 0x7c,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4c] 0xc1,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, -1, v2 ; encoding: 
[0xc1,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4c] 0xf0,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4c] 0xfd,0x04,0x0a,0x4c +# GFX12: v_sub_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4c] -# GFX12: v_sub_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf +# GFX12: v_sub_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4d,0x56,0x34,0x12,0xaf] +0x01,0x05,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0x0a,0x44] -0x01,0x05,0x0a,0x44 +0xff,0x05,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, v255, v2, vcc_lo ; encoding: [0xff,0x05,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, v255, v2, vcc ; encoding: [0xff,0x05,0x0a,0x44] -0xff,0x05,0x0a,0x44 +0x01,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v2, vcc_lo ; encoding: [0x01,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, s1, v2, vcc ; encoding: [0x01,0x04,0x0a,0x44] -0x01,0x04,0x0a,0x44 +0x69,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, s105, v2, vcc_lo ; encoding: [0x69,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, s105, v2, vcc ; encoding: [0x69,0x04,0x0a,0x44] -0x69,0x04,0x0a,0x44 +0x6a,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, vcc_lo, v2, vcc_lo ; encoding: [0x6a,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, vcc_lo, v2, vcc ; encoding: [0x6a,0x04,0x0a,0x44] -0x6a,0x04,0x0a,0x44 +0x6b,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, vcc_hi, v2, vcc_lo ; encoding: [0x6b,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, vcc_hi, v2, vcc ; encoding: [0x6b,0x04,0x0a,0x44] -0x6b,0x04,0x0a,0x44 +0x7b,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, 
vcc_lo, ttmp15, v2, vcc_lo ; encoding: [0x7b,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, ttmp15, v2, vcc ; encoding: [0x7b,0x04,0x0a,0x44] -0x7b,0x04,0x0a,0x44 +0x7d,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, m0, v2, vcc_lo ; encoding: [0x7d,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, m0, v2, vcc ; encoding: [0x7d,0x04,0x0a,0x44] -0x7d,0x04,0x0a,0x44 +0x7e,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, exec_lo, v2, vcc_lo ; encoding: [0x7e,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, exec_lo, v2, vcc ; encoding: [0x7e,0x04,0x0a,0x44] -0x7e,0x04,0x0a,0x44 +0x7f,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, exec_hi, v2, vcc_lo ; encoding: [0x7f,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, exec_hi, v2, vcc ; encoding: [0x7f,0x04,0x0a,0x44] -0x7f,0x04,0x0a,0x44 +0x7c,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, null, v2, vcc_lo ; encoding: [0x7c,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, null, v2, vcc ; encoding: [0x7c,0x04,0x0a,0x44] -0x7c,0x04,0x0a,0x44 +0xc1,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, -1, v2, vcc_lo ; encoding: [0xc1,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, -1, v2, vcc ; encoding: [0xc1,0x04,0x0a,0x44] -0xc1,0x04,0x0a,0x44 +0xf0,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, 0.5, v2, vcc_lo ; encoding: [0xf0,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, 0.5, v2, vcc ; encoding: [0xf0,0x04,0x0a,0x44] -0xf0,0x04,0x0a,0x44 +0xfd,0x04,0x0a,0x44 # W32: v_subrev_co_ci_u32_e32 v5, vcc_lo, src_scc, v2, vcc_lo ; encoding: [0xfd,0x04,0x0a,0x44] # W64: v_subrev_co_ci_u32_e32 v5, vcc, src_scc, v2, vcc ; encoding: [0xfd,0x04,0x0a,0x44] -0xfd,0x04,0x0a,0x44 +0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf # W32: v_subrev_co_ci_u32_e32 v255, vcc_lo, 0xaf123456, v255, vcc_lo ; encoding: [0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] # W64: v_subrev_co_ci_u32_e32 v255, vcc, 0xaf123456, v255, vcc ; encoding: 
[0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf] -0xff,0xfe,0xff,0x45,0x56,0x34,0x12,0xaf -# GFX12: v_subrev_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x68] 0x01,0x05,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x68] 0x7f,0x05,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, v127, v2 ; encoding: [0x7f,0x05,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x68] 0x01,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, s1, v2.l ; encoding: [0x01,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x68] 0x69,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, s105, v2.l ; encoding: [0x69,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x68] 0x6a,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x68] 0x6b,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x68] 0x7b,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x68] 
0x7d,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x68] 0x7e,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x68] 0x7f,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x68] 0x7c,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, null, v2.l ; encoding: [0x7c,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x68] 0xc1,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x68] 0xf0,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x68] 0xfd,0x04,0x0a,0x68 +# GFX12-REAL16: v_subrev_f16_e32 v5.l, src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x68] +# GFX12-FAKE16: v_subrev_f16_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x68] -# GFX12: v_subrev_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] 0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_subrev_f16_e32 v127.l, 0xfe0b, v127.l ; encoding: 
[0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_subrev_f16_e32 v127, 0xfe0b, v127 ; encoding: [0xff,0xfe,0xfe,0x68,0x0b,0xfe,0x00,0x00] -# GFX12: v_subrev_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0a] 0x01,0x05,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0a] 0xff,0x05,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0a] 0x01,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0a] 0x69,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0a] 0x6a,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0a] 0x6b,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0a] 0x7b,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0a] 0x7d,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0a] 0x7e,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0a] 0x7f,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x0a] 0x7c,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, null, v2 ; encoding: 
[0x7c,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0a] 0xc1,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0a] 0xf0,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0a] 0xfd,0x04,0x0a,0x0a +# GFX12: v_subrev_f32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x0a] -# GFX12: v_subrev_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf +# GFX12: v_subrev_f32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x0b,0x56,0x34,0x12,0xaf] -# GFX12: v_subrev_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4e] 0x01,0x05,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4e] 0xff,0x05,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4e] 0x01,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4e] 0x69,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4e] 0x6a,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4e] 0x6b,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4e] 0x7b,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x4e] -# 
GFX12: v_subrev_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4e] 0x7d,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4e] 0x7e,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4e] 0x7f,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4e] 0x7c,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4e] 0xc1,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4e] 0xf0,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4e] 0xfd,0x04,0x0a,0x4e +# GFX12: v_subrev_nc_u32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x4e] -# GFX12: v_subrev_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf +# GFX12: v_subrev_nc_u32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x4f,0x56,0x34,0x12,0xaf] -# GFX12: v_xnor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3c] 0x01,0x05,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3c] 0xff,0x05,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3c] 0x01,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, s105, v2 
; encoding: [0x69,0x04,0x0a,0x3c] 0x69,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3c] 0x6a,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3c] 0x6b,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3c] 0x7b,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3c] 0x7d,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3c] 0x7e,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3c] 0x7f,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3c] 0x7c,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3c] 0xc1,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3c] 0xf0,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3c] 0xfd,0x04,0x0a,0x3c +# GFX12: v_xnor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3c] -# GFX12: v_xnor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf +# GFX12: v_xnor_b32_e32 v255, 0xaf123456, v255 ; encoding: 
[0xff,0xfe,0xff,0x3d,0x56,0x34,0x12,0xaf] -# GFX12: v_xor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3a] 0x01,0x05,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3a] 0xff,0x05,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, v255, v2 ; encoding: [0xff,0x05,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3a] 0x01,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, s1, v2 ; encoding: [0x01,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3a] 0x69,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, s105, v2 ; encoding: [0x69,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3a] 0x6a,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3a] 0x6b,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3a] 0x7b,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3a] 0x7d,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, m0, v2 ; encoding: [0x7d,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3a] 0x7e,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3a] 0x7f,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3a] 0x7c,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, null, v2 ; encoding: [0x7c,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3a] 0xc1,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, -1, v2 ; encoding: [0xc1,0x04,0x0a,0x3a] -# 
GFX12: v_xor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3a] 0xf0,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3a] 0xfd,0x04,0x0a,0x3a +# GFX12: v_xor_b32_e32 v5, src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x3a] -# GFX12: v_xor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] 0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf +# GFX12: v_xor_b32_e32 v255, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x3b,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt index 05c8dff02a40b..551fb0d311188 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp16.txt @@ -1,1696 +1,1797 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-FAKE16 %s +0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] # W64: 
v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff] 
-0xfa,0x04,0x0a,0x40,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x40,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x40,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30 # W32: v_add_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30] # W64: v_add_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x41,0xff,0x6f,0x0d,0x30 -# GFX12: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] +# 
GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: 
v_add_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x50,0x01,0xff] -# GFX12: v_add_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x5f,0x01,0x01] -# GFX12: v_add_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x64,0x01,0x60,0x01,0x13] -# GFX12: v_add_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_add_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_add_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x64,0x7f,0x6f,0xfd,0x30] -# GFX12: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] 
0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff +# GFX12: v_add_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x50,0x01,0xff] -# GFX12: v_add_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01 +# GFX12: v_add_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x5f,0x01,0x01] -# GFX12: v_add_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13 +# GFX12: v_add_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x06,0x01,0x60,0x01,0x13] -# GFX12: v_add_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30 +# GFX12: v_add_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x07,0xff,0x6f,0xfd,0x30] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff +# GFX12: v_add_nc_u32_dpp v5, 
v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1b,0x00,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x40,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x41,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x01,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x0f,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x11,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x1f,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x21,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x2f,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x50,0x01,0xff] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01 +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x5f,0x01,0x01] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13 +# GFX12: v_add_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4a,0x01,0x60,0x01,0x13] -# GFX12: v_add_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30 +# GFX12: v_add_nc_u32_dpp v255, v255, v255 row_xmask:15 
row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4b,0xff,0x6f,0x0d,0x30] -# GFX12: v_and_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1b,0x00,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0xe4,0x00,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x40,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x41,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x01,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x0f,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x11,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x1f,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x21,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x2f,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff +# GFX12: v_and_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x50,0x01,0xff] -# GFX12: v_and_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01 +# GFX12: v_and_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x5f,0x01,0x01] -# GFX12: v_and_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13 +# GFX12: v_and_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x36,0x01,0x60,0x01,0x13] -# GFX12: 
v_and_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30 +# GFX12: v_and_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x37,0xff,0x6f,0x0d,0x30] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1b,0x00,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0xe4,0x00,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x40,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x41,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x01,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x0f,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x11,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x1f,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x21,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x2f,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x50,0x01,0xff] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01 +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x0a,0x34,0x01,0x5f,0x01,0x01] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13 +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x34,0x01,0x60,0x01,0x13] -# GFX12: v_ashrrev_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30 +# GFX12: v_ashrrev_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x35,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] # W64: v_cndmask_b32_dpp v5, 
v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x02,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x02,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x02,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30 # W32: v_cndmask_b32_dpp v255, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30] # W64: v_cndmask_b32_dpp v255, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x03,0xff,0x6f,0x0d,0x30 -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0xff] -# GFX12: 
v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x50,0x01,0xff] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01 +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x5f,0x01,0x01] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13 +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x5e,0x01,0x60,0x01,0x13] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30 +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x5f,0xff,0x6f,0xfd,0x30] -# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1b,0x00,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0xe4,0x00,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x40,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x41,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x01,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff +# GFX12: v_fmac_f16_dpp 
v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x0f,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x11,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x1f,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x21,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x2f,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x50,0x01,0xff] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01 +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x5f,0x01,0x01] -# GFX12: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13 +# GFX12: v_fmac_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6c,0x01,0x60,0x01,0x13] -# GFX12: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30 +# GFX12: v_fmac_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6c,0x7f,0x6f,0xfd,0x30] -# GFX12: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, 
v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x50,0x01,0xff] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01 +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x5f,0x01,0x01] -# GFX12: v_fmac_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13 +# GFX12: v_fmac_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x56,0x01,0x60,0x01,0x13] -# GFX12: v_fmac_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30 +# GFX12: v_fmac_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x57,0xff,0x6f,0xfd,0x30] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff +# 
GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shr:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x50,0x01,0xff] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x5f,0x01,0x01] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x76,0x01,0x60,0x01,0x13] -# GFX12: v_ldexp_f16_dpp v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] 0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30 +# GFX12-REAL16: v_ldexp_f16_dpp v127.l, -|v127.l|, v127.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] +# GFX12-FAKE16: v_ldexp_f16_dpp v127, -|v127|, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x76,0x7f,0x6f,0x3d,0x30] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1b,0x00,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0xe4,0x00,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x40,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x41,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x01,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x0f,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x11,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x1f,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x21,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x2f,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x50,0x01,0xff] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01 +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x5f,0x01,0x01] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13 +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x30,0x01,0x60,0x01,0x13] -# GFX12: v_lshlrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30 +# GFX12: v_lshlrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x31,0xff,0x6f,0x0d,0x30] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1b,0x00,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0xe4,0x00,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x40,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x41,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x01,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x0f,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x11,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, 
v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x1f,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x21,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x2f,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x50,0x01,0xff] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01 +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x5f,0x01,0x01] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13 +# GFX12: v_lshrrev_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x32,0x01,0x60,0x01,0x13] -# GFX12: v_lshrrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30 +# GFX12: v_lshrrev_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x33,0xff,0x6f,0x0d,0x30] -# GFX12: v_max_num_f16_dpp 
v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0xff] 
0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0xff] +# 
GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x62,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x50,0x01,0xff] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x62,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x5f,0x01,0x01] -# GFX12: v_max_num_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x62,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x62,0x01,0x60,0x01,0x13] -# GFX12: v_max_num_f16_dpp v127, 
-|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x62,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x62,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_max_num_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x62,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_max_num_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x62,0x7f,0x6f,0xfd,0x30] -# GFX12: v_max_num_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x1b,0x00,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1b,0x00,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x40,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x40,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x41,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x41,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x01,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x01,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x0f,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x0f,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x11,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x11,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x1f,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x1f,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x21,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x21,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x2f,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x2f,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x2c,0x01,0x50,0x01,0xff +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x50,0x01,0xff] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x0a,0x2c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x2c,0x01,0x5f,0x01,0x01 +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x5f,0x01,0x01] -# GFX12: v_max_num_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x2c,0x01,0x60,0x01,0x13 +# GFX12: v_max_num_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x2c,0x01,0x60,0x01,0x13] -# GFX12: v_max_num_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x2d,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x2d,0xff,0x6f,0xfd,0x30 +# GFX12: v_max_num_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x2d,0xff,0x6f,0xfd,0x30] -# GFX12: v_max_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff +# GFX12: v_max_i32_dpp 
v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] 
0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff +# GFX12: v_max_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x50,0x01,0xff] -# GFX12: v_max_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01 +# GFX12: v_max_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x5f,0x01,0x01] -# GFX12: v_max_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13 +# GFX12: v_max_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x24,0x01,0x60,0x01,0x13] -# GFX12: v_max_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30 +# GFX12: v_max_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x25,0xff,0x6f,0x0d,0x30] -# GFX12: v_max_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff +# GFX12: 
v_max_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff +# GFX12: v_max_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x50,0x01,0xff] -# GFX12: v_max_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01 +# GFX12: v_max_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x5f,0x01,0x01] -# GFX12: v_max_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13 +# GFX12: v_max_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x28,0x01,0x60,0x01,0x13] -# GFX12: v_max_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30 +# GFX12: v_max_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x29,0xff,0x6f,0x0d,0x30] -# GFX12: v_min_num_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xff] 
0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0xff] -# GFX12: 
v_min_num_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x60,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x50,0x01,0xff] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x60,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x5f,0x01,0x01] -# GFX12: v_min_num_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x60,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x60,0x01,0x60,0x01,0x13] -# GFX12: v_min_num_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x60,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x60,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_min_num_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x60,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_min_num_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x60,0x7f,0x6f,0xfd,0x30] -# GFX12: v_min_num_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x1b,0x00,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1b,0x00,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x40,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x40,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x41,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x41,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x01,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x01,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x0f,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x0f,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x11,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x11,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x1f,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x1f,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x21,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x21,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x2f,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x2f,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x2a,0x01,0x50,0x01,0xff +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x50,0x01,0xff] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x2a,0x01,0x5f,0x01,0x01 +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x5f,0x01,0x01] -# GFX12: v_min_num_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x2a,0x01,0x60,0x01,0x13 +# GFX12: v_min_num_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x2a,0x01,0x60,0x01,0x13] -# GFX12: v_min_num_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0xff,0x2b,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x2b,0xff,0x6f,0xfd,0x30 +# GFX12: v_min_num_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x2b,0xff,0x6f,0xfd,0x30] -# GFX12: v_min_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_shl:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff +# GFX12: v_min_i32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x50,0x01,0xff] -# GFX12: v_min_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01 +# GFX12: v_min_i32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x5f,0x01,0x01] -# GFX12: v_min_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13] 
0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13 +# GFX12: v_min_i32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x22,0x01,0x60,0x01,0x13] -# GFX12: v_min_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30 +# GFX12: v_min_i32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x23,0xff,0x6f,0x0d,0x30] -# GFX12: v_min_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff +# GFX12: v_min_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x50,0x01,0xff] -# GFX12: v_min_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01 +# GFX12: v_min_u32_dpp 
v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x5f,0x01,0x01] -# GFX12: v_min_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13 +# GFX12: v_min_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x26,0x01,0x60,0x01,0x13] -# GFX12: v_min_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30 +# GFX12: v_min_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x27,0xff,0x6f,0x0d,0x30] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x50,0x01,0xff] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13 +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0e,0x01,0x60,0x01,0x13] -# GFX12: v_mul_dx9_zero_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30 +# GFX12: v_mul_dx9_zero_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0f,0xff,0x6f,0xfd,0x30] -# GFX12: v_mul_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x40,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x41,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x01,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x11,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x21,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x50,0x01,0xff] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x6a,0x01,0x60,0x01,0x13] -# GFX12: v_mul_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_mul_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_mul_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x6a,0x7f,0x6f,0xfd,0x30] -# GFX12: v_mul_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0xff] 
-# GFX12: v_mul_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff +# GFX12: v_mul_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x50,0x01,0xff] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13 +# GFX12: v_mul_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x10,0x01,0x60,0x01,0x13] -# GFX12: v_mul_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30 +# GFX12: v_mul_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x11,0xff,0x6f,0xfd,0x30] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] 
0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x50,0x01,0xff] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13 +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x14,0x01,0x60,0x01,0x13] -# GFX12: v_mul_hi_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30 +# GFX12: v_mul_hi_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x15,0xff,0x6f,0x0d,0x30] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x50,0x01,0xff] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13 +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x18,0x01,0x60,0x01,0x13] -# GFX12: v_mul_hi_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30 +# GFX12: v_mul_hi_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x19,0xff,0x6f,0x0d,0x30] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0xff] -# 
GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff +# GFX12: 
v_mul_i32_i24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x50,0x01,0xff] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13 +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x12,0x01,0x60,0x01,0x13] -# GFX12: v_mul_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30 +# GFX12: v_mul_i32_i24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x13,0xff,0x6f,0x0d,0x30] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] 
0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x50,0x01,0xff] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01 +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x5f,0x01,0x01] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13 +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x16,0x01,0x60,0x01,0x13] -# GFX12: v_mul_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30 +# GFX12: v_mul_u32_u24_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x17,0xff,0x6f,0x0d,0x30] -# GFX12: v_or_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff +# 
GFX12: v_or_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff +# GFX12: v_or_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x50,0x01,0xff] -# GFX12: v_or_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01 +# GFX12: v_or_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x5f,0x01,0x01] -# GFX12: v_or_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13 +# GFX12: v_or_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x38,0x01,0x60,0x01,0x13] -# GFX12: v_or_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30 +# GFX12: v_or_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x39,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff] 
-0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, 
vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff] -0xfa,0x04,0x0a,0x42,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x42,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x42,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30 # W32: v_sub_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30] # W64: v_sub_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x43,0xff,0x6f,0x0d,0x30 -# GFX12: v_sub_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] 
0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x50,0x01,0xff] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] 
0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x5f,0x01,0x01] -# GFX12: v_sub_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x66,0x01,0x60,0x01,0x13] -# GFX12: v_sub_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_sub_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_sub_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x66,0x7f,0x6f,0xfd,0x30] -# GFX12: v_sub_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0xff] -# GFX12: 
v_sub_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff +# GFX12: v_sub_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x50,0x01,0xff] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01 +# GFX12: v_sub_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x5f,0x01,0x01] -# GFX12: v_sub_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13 +# GFX12: v_sub_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x08,0x01,0x60,0x01,0x13] -# GFX12: v_sub_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30 +# GFX12: v_sub_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x09,0xff,0x6f,0xfd,0x30] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] 
0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0xff] -# GFX12: 
v_sub_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x50,0x01,0xff] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01 +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x5f,0x01,0x01] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13 +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4c,0x01,0x60,0x01,0x13] -# GFX12: v_sub_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30 +# GFX12: v_sub_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4d,0xff,0x6f,0x0d,0x30] +0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0xff +0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff] -0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xff +0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, 
v2, vcc row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff] -0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff] 
-0xfa,0x04,0x0a,0x44,0x01,0x50,0x01,0xff +0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01] -0xfa,0x04,0x0a,0x44,0x01,0x5f,0x01,0x01 +0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13] -0xfa,0x04,0x0a,0x44,0x01,0x60,0x01,0x13 +0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30 # W32: v_subrev_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30] # W64: v_subrev_co_ci_u32_dpp v255, vcc, v255, v255, vcc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30] -0xfa,0xfe,0xff,0x45,0xff,0x6f,0x0d,0x30 -# GFX12: v_subrev_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1b,0x00,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0xe4,0x00,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x40,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x41,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x01,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x0f,0x01,0xff] 
-# GFX12: v_subrev_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x11,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x1f,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x21,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x2f,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff +# GFX12-REAL16: v_subrev_f16_dpp 
v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x50,0x01,0xff] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x5f,0x01,0x01] -# GFX12: v_subrev_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x68,0x01,0x60,0x01,0x13] -# GFX12: v_subrev_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] 0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30 +# GFX12-REAL16: v_subrev_f16_dpp v127.l, -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_subrev_f16_dpp v127, -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x68,0x7f,0x6f,0xfd,0x30] -# GFX12: v_subrev_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] 
0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x50,0x01,0xff] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01 +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x5f,0x01,0x01] -# GFX12: v_subrev_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13 +# GFX12: v_subrev_f32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x0a,0x01,0x60,0x01,0x13] -# GFX12: v_subrev_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30] 0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30 +# GFX12: v_subrev_f32_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0xff,0x0b,0xff,0x6f,0xfd,0x30] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x50,0x01,0xff] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01 +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x5f,0x01,0x01] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13 +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 
bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x4e,0x01,0x60,0x01,0x13] -# GFX12: v_subrev_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30 +# GFX12: v_subrev_nc_u32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x4f,0xff,0x6f,0x0d,0x30] -# GFX12: v_xnor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shl:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x50,0x01,0xff] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01 +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x0a,0x3c,0x01,0x5f,0x01,0x01] -# GFX12: v_xnor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13 +# GFX12: v_xnor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3c,0x01,0x60,0x01,0x13] -# GFX12: v_xnor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30 +# GFX12: v_xnor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3d,0xff,0x6f,0x0d,0x30] -# GFX12: v_xor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff +# GFX12: v_xor_b32_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x50,0x01,0xff] -# GFX12: v_xor_b32_dpp v5, 
v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01 +# GFX12: v_xor_b32_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x5f,0x01,0x01] -# GFX12: v_xor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13] 0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13 +# GFX12: v_xor_b32_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x3a,0x01,0x60,0x01,0x13] -# GFX12: v_xor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30] 0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30 +# GFX12: v_xor_b32_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xff,0x3b,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt index 2e33df35af1f3..bbf494c153fd3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop2_dpp8.txt @@ -1,244 +1,261 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64 %s +; NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble 
-show-encoding < %s | FileCheck -check-prefixes=GFX12,W32,GFX12-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX12,W64,GFX12-FAKE16 %s +0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05 # W32: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] # W64: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x40,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00 # W32: v_add_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] # W64: v_add_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x41,0xff,0x00,0x00,0x00 -# GFX12: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_add_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_add_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x64,0x01,0x77,0x39,0x05] -# GFX12: v_add_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_add_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_add_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x64,0x7f,0x00,0x00,0x00] -# GFX12: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05 +# GFX12: v_add_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x0a,0x06,0x01,0x77,0x39,0x05] -# GFX12: v_add_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00 +# GFX12: v_add_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x07,0xff,0x00,0x00,0x00] -# GFX12: v_add_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05 +# GFX12: v_add_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4a,0x01,0x77,0x39,0x05] -# GFX12: v_add_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00 +# GFX12: v_add_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4b,0xff,0x00,0x00,0x00] -# GFX12: v_and_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05 +# GFX12: v_and_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x36,0x01,0x77,0x39,0x05] -# GFX12: v_and_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00 +# GFX12: v_and_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x37,0xff,0x00,0x00,0x00] -# GFX12: v_ashrrev_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05 +# GFX12: v_ashrrev_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x34,0x01,0x77,0x39,0x05] -# GFX12: v_ashrrev_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00 +# GFX12: v_ashrrev_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0xff,0x35,0xff,0x00,0x00,0x00] +0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05 # W32: v_cndmask_b32_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] # W64: v_cndmask_b32_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x02,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00 # W32: v_cndmask_b32_dpp v255, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] # W64: v_cndmask_b32_dpp v255, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x03,0xff,0x00,0x00,0x00 -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05 +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x5e,0x01,0x77,0x39,0x05] -# GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_rtz_f16_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x5f,0xff,0x00,0x00,0x00] -# GFX12: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05 +# GFX12: v_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6c,0x01,0x77,0x39,0x05] -# GFX12: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00 +# GFX12: v_fmac_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6c,0x7f,0x00,0x00,0x00] -# GFX12: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] 
0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05 +# GFX12: v_fmac_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x56,0x01,0x77,0x39,0x05] -# GFX12: v_fmac_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00 +# GFX12: v_fmac_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x57,0xff,0x00,0x00,0x00] -# GFX12: v_ldexp_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_ldexp_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_ldexp_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x76,0x01,0x77,0x39,0x05] -# GFX12: v_ldexp_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_ldexp_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_ldexp_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x76,0x7f,0x00,0x00,0x00] -# GFX12: v_lshlrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05 +# GFX12: v_lshlrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x30,0x01,0x77,0x39,0x05] -# GFX12: v_lshlrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00 +# GFX12: v_lshlrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x31,0xff,0x00,0x00,0x00] -# GFX12: v_lshrrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05 
+# GFX12: v_lshrrev_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x32,0x01,0x77,0x39,0x05] -# GFX12: v_lshrrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00 +# GFX12: v_lshrrev_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x33,0xff,0x00,0x00,0x00] -# GFX12: v_max_num_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x62,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x62,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_max_num_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x62,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_max_num_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x62,0x01,0x77,0x39,0x05] -# GFX12: v_max_num_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x62,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x62,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_max_num_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x62,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_max_num_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x62,0x7f,0x00,0x00,0x00] -# GFX12: v_max_num_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x2c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x2c,0x01,0x77,0x39,0x05 +# GFX12: v_max_num_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x2c,0x01,0x77,0x39,0x05] -# GFX12: v_max_num_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x2d,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x2d,0xff,0x00,0x00,0x00 +# GFX12: v_max_num_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x2d,0xff,0x00,0x00,0x00] -# GFX12: v_max_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05 +# GFX12: 
v_max_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x24,0x01,0x77,0x39,0x05] -# GFX12: v_max_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00 +# GFX12: v_max_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x25,0xff,0x00,0x00,0x00] -# GFX12: v_max_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05 +# GFX12: v_max_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x28,0x01,0x77,0x39,0x05] -# GFX12: v_max_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00 +# GFX12: v_max_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x29,0xff,0x00,0x00,0x00] -# GFX12: v_min_num_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x60,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x60,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_min_num_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x60,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_min_num_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x60,0x01,0x77,0x39,0x05] -# GFX12: v_min_num_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x60,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x60,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_min_num_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x60,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_min_num_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x60,0x7f,0x00,0x00,0x00] -# GFX12: v_min_num_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x2a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x2a,0x01,0x77,0x39,0x05 +# GFX12: v_min_num_f32_dpp v5, v1, v2 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x2a,0x01,0x77,0x39,0x05] -# GFX12: v_min_num_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x2b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x2b,0xff,0x00,0x00,0x00 +# GFX12: v_min_num_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x2b,0xff,0x00,0x00,0x00] -# GFX12: v_min_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05 +# GFX12: v_min_i32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x22,0x01,0x77,0x39,0x05] -# GFX12: v_min_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00 +# GFX12: v_min_i32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x23,0xff,0x00,0x00,0x00] -# GFX12: v_min_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05 +# GFX12: v_min_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x26,0x01,0x77,0x39,0x05] -# GFX12: v_min_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00 +# GFX12: v_min_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x27,0xff,0x00,0x00,0x00] -# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05 +# GFX12: v_mul_dx9_zero_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0e,0x01,0x77,0x39,0x05] -# GFX12: v_mul_dx9_zero_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00 +# GFX12: v_mul_dx9_zero_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0xea,0xfe,0xff,0x0f,0xff,0x00,0x00,0x00] -# GFX12: v_mul_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_mul_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_mul_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x6a,0x01,0x77,0x39,0x05] -# GFX12: v_mul_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_mul_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_mul_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x6a,0x7f,0x00,0x00,0x00] -# GFX12: v_mul_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05 +# GFX12: v_mul_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x10,0x01,0x77,0x39,0x05] -# GFX12: v_mul_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00 +# GFX12: v_mul_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x11,0xff,0x00,0x00,0x00] -# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05 +# GFX12: v_mul_hi_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x14,0x01,0x77,0x39,0x05] -# GFX12: v_mul_hi_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00 +# GFX12: v_mul_hi_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xea,0xfe,0xff,0x15,0xff,0x00,0x00,0x00] -# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05 +# GFX12: v_mul_hi_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x18,0x01,0x77,0x39,0x05] -# GFX12: v_mul_hi_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00 +# GFX12: v_mul_hi_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x19,0xff,0x00,0x00,0x00] -# GFX12: v_mul_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05 +# GFX12: v_mul_i32_i24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x12,0x01,0x77,0x39,0x05] -# GFX12: v_mul_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00 +# GFX12: v_mul_i32_i24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x13,0xff,0x00,0x00,0x00] -# GFX12: v_mul_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05 +# GFX12: v_mul_u32_u24_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x16,0x01,0x77,0x39,0x05] -# GFX12: v_mul_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00 +# GFX12: v_mul_u32_u24_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x17,0xff,0x00,0x00,0x00] -# GFX12: v_or_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05 +# GFX12: v_or_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0x04,0x0a,0x38,0x01,0x77,0x39,0x05] -# GFX12: v_or_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00 +# GFX12: v_or_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x39,0xff,0x00,0x00,0x00] +0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05 # W32: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] # W64: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x42,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00 # W32: v_sub_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] # W64: v_sub_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x43,0xff,0x00,0x00,0x00 -# GFX12: v_sub_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_sub_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_sub_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x66,0x01,0x77,0x39,0x05] -# GFX12: v_sub_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_sub_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_sub_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x66,0x7f,0x00,0x00,0x00] -# GFX12: v_sub_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] 
0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05 +# GFX12: v_sub_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x08,0x01,0x77,0x39,0x05] -# GFX12: v_sub_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00 +# GFX12: v_sub_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x09,0xff,0x00,0x00,0x00] -# GFX12: v_sub_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05 +# GFX12: v_sub_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4c,0x01,0x77,0x39,0x05] -# GFX12: v_sub_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00 +# GFX12: v_sub_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4d,0xff,0x00,0x00,0x00] +0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05 # W32: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] # W64: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05] -0xe9,0x04,0x0a,0x44,0x01,0x77,0x39,0x05 +0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00 # W32: v_subrev_co_ci_u32_dpp v255, vcc_lo, v255, v255, vcc_lo dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] # W64: v_subrev_co_ci_u32_dpp v255, vcc, v255, v255, vcc dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00] -0xea,0xfe,0xff,0x45,0xff,0x00,0x00,0x00 -# GFX12: v_subrev_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_subrev_f16_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] +# 
GFX12-FAKE16: v_subrev_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x68,0x01,0x77,0x39,0x05] -# GFX12: v_subrev_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] 0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00 +# GFX12-REAL16: v_subrev_f16_dpp v127.l, v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_subrev_f16_dpp v127, v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfe,0x68,0x7f,0x00,0x00,0x00] -# GFX12: v_subrev_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05 +# GFX12: v_subrev_f32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x0a,0x01,0x77,0x39,0x05] -# GFX12: v_subrev_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00 +# GFX12: v_subrev_f32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x0b,0xff,0x00,0x00,0x00] -# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05 +# GFX12: v_subrev_nc_u32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] -# GFX12: v_subrev_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00 +# GFX12: v_subrev_nc_u32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x4f,0xff,0x00,0x00,0x00] -# GFX12: v_xnor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05 +# GFX12: v_xnor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3c,0x01,0x77,0x39,0x05] -# GFX12: v_xnor_b32_dpp v255, v255, 
v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00 +# GFX12: v_xnor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3d,0xff,0x00,0x00,0x00] -# GFX12: v_xor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] 0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05 +# GFX12: v_xor_b32_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x3a,0x01,0x77,0x39,0x05] -# GFX12: v_xor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] 0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00 +# GFX12: v_xor_b32_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xff,0x3b,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt b/llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt new file mode 100644 index 0000000000000..e714a52d2c31a --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt @@ -0,0 +1,34 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: tdpbf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdpbf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x58,0xfd,0xf5 + +# ATT: tdpbf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdpbf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x70,0xfd,0xda + +# ATT: tdpbhf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdpbhf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x5b,0xfd,0xf5 + +# ATT: tdpbhf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdpbhf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x73,0xfd,0xda + +# ATT: tdphbf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdphbf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x5a,0xfd,0xf5 + +# ATT: tdphbf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdphbf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x72,0xfd,0xda + +# ATT: tdphf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdphf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x59,0xfd,0xf5 + +# ATT: tdphf8ps 
%tmm1, %tmm2, %tmm3 +# INTEL: tdphf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x71,0xfd,0xda diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.s b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s new file mode 100644 index 0000000000000..da3fa95ef6dd0 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5 + +// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 +// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5 + +// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2 +// CHECK: encoding: 
[0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3 + +// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 +// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 + +// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + +// CHECK: ttransposed %tmm1, %tmm5 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] + ttransposed %tmm1, %tmm5 + +// CHECK: ttransposed %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] + ttransposed %tmm2, %tmm3 diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt new file mode 100644 index 0000000000000..e4f1689639ef9 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -0,0 +1,58 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] 
+0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456] +0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] +0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: ttransposed %tmm1, %tmm2 +# INTEL: ttransposed tmm2, tmm1 +0xc4,0xe2,0x7a,0x5f,0xd1 + +# ATT: ttransposed %tmm2, %tmm3 +# INTEL: ttransposed tmm3, tmm2 +0xc4,0xe2,0x7a,0x5f,0xda diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s new file mode 100644 index 0000000000000..3b8dfaed313d6 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s @@ -0,0 +1,57 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0 tmm2, [2*rbp - 32] + 
+// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] +// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291] + +// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + t2rpntlvwz1t1 tmm2, [2*rbp - 32] + +// CHECK: ttransposed tmm5, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] + ttransposed tmm5, tmm1 + +// CHECK: ttransposed tmm3, tmm2 +// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda] + ttransposed tmm3, tmm2 diff --git a/llvm/test/MC/X86/AMX/amx-fp8-att.s b/llvm/test/MC/X86/AMX/amx-fp8-att.s new file mode 100644 index 0000000000000..904539ec4917f --- /dev/null +++ b/llvm/test/MC/X86/AMX/amx-fp8-att.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: 
tdpbf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5] + tdpbf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdpbf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda] + tdpbf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdpbhf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5] + tdpbhf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdpbhf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda] + tdpbhf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdphbf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5] + tdphbf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdphbf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda] + tdphbf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdphf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5] + tdphf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdphf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda] + tdphf8ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/amx-fp8-intel.s b/llvm/test/MC/X86/AMX/amx-fp8-intel.s new file mode 100644 index 0000000000000..4191ae6f5cd13 --- /dev/null +++ b/llvm/test/MC/X86/AMX/amx-fp8-intel.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tdpbf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5] + tdpbf8ps tmm6, tmm5, tmm4 + +// CHECK: tdpbf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda] + tdpbf8ps tmm3, tmm2, tmm1 + +// CHECK: tdpbhf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5] + tdpbhf8ps tmm6, tmm5, tmm4 + +// CHECK: tdpbhf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda] + tdpbhf8ps tmm3, tmm2, tmm1 + +// CHECK: tdphbf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5] + tdphbf8ps tmm6, tmm5, tmm4 + +// CHECK: tdphbf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda] + tdphbf8ps tmm3, tmm2, tmm1 + +// 
CHECK: tdphf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5] + tdphf8ps tmm6, tmm5, tmm4 + +// CHECK: tdphf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda] + tdphf8ps tmm3, tmm2, tmm1 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll index d3fb9d8ee522e..69cf9697d4b30 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll @@ -187,7 +187,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -204,7 +204,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -221,7 +221,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -238,7 +238,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -260,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; 
GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -292,7 +292,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -309,7 +309,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -326,7 +326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -343,7 +343,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: 
[[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -382,7 +382,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -409,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -426,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -443,7 +443,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -460,7 +460,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[NEW:%.*]] = fadd 
float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -482,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -514,7 +514,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -531,7 +531,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -548,7 +548,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -565,7 +565,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -587,7 +587,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -619,7 +619,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float 
[[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -636,7 +636,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -653,7 +653,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory 
[[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -670,7 +670,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -692,7 +692,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -841,7 +841,7 @@ define float 
@test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -858,7 +858,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -875,7 +875,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; 
GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -902,7 +902,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -934,7 +934,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -951,7 +951,7 @@ define float 
@test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -968,7 +968,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -985,7 +985,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1007,7 +1007,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1024,7 +1024,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1051,7 +1051,7 @@ define float 
@test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1068,7 +1068,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1085,7 +1085,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1112,7 +1112,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1144,7 +1144,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { 
i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1161,7 +1161,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1178,7 +1178,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1205,7 +1205,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; 
GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1237,7 +1237,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1254,7 +1254,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1271,7 +1271,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1298,7 +1298,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to float @@ -1451,7 +1451,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1468,7 +1468,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1485,7 +1485,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1502,7 +1502,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1524,7 +1524,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = 
bitcast i32 [[NEWLOADED]] to float @@ -1556,7 +1556,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1573,7 +1573,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1590,7 +1590,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1607,7 +1607,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1629,7 +1629,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to float @@ -1646,7 +1646,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1673,7 +1673,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1690,7 +1690,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1707,7 +1707,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1724,7 +1724,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1746,7 +1746,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1778,7 +1778,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1795,7 +1795,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX906-NEXT: [[NEW:%.*]] = fadd 
float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1812,7 +1812,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1829,7 +1829,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1851,7 +1851,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1883,7 +1883,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1900,7 +1900,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1917,7 +1917,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1934,7 +1934,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast 
float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1956,7 +1956,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2105,7 +2105,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory 
[[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2122,7 +2122,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2154,7 +2154,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2186,7 +2186,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: 
[[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2203,7 +2203,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2220,7 +2220,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory 
[[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2237,7 +2237,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2259,7 +2259,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2276,7 +2276,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast 
float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2303,7 +2303,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2320,7 +2320,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2352,7 +2352,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2384,7 +2384,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2401,7 +2401,7 @@ define void 
@test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2433,7 +2433,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2465,7 +2465,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2482,7 +2482,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2514,7 +2514,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_ ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue 
{ i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2572,7 +2572,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2594,7 +2594,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2616,7 +2616,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: 
[[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2660,7 +2660,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_ ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2682,7 +2682,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_ ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; 
COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2704,7 +2704,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_ ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2859,7 +2859,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2876,7 +2876,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: 
[[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2893,7 +2893,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2910,7 +2910,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, 
align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2927,7 +2927,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2964,7 +2964,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; 
GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2981,7 +2981,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2998,7 +2998,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3015,7 +3015,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = 
bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3032,7 +3032,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3049,7 +3049,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; 
GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3066,7 +3066,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3093,7 +3093,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3110,7 +3110,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; 
GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3127,7 +3127,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3144,7 +3144,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3161,7 +3161,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3327,7 +3327,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3344,7 +3344,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3361,7 +3361,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3378,7 +3378,7 @@ define float 
@test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3395,7 +3395,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3432,7 +3432,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3449,7 +3449,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3466,7 +3466,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; 
GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3483,7 +3483,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3500,7 +3500,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3517,7 +3517,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: 
[[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3534,7 +3534,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3561,7 +3561,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") 
seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3578,7 +3578,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3595,7 +3595,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; 
GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3612,7 +3612,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3629,7 +3629,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3799,7 +3799,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; 
GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3816,7 +3816,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3833,7 +3833,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3850,7 +3850,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3867,7 +3867,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3904,7 
+3904,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3921,7 +3921,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3938,7 +3938,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3955,7 +3955,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3972,7 +3972,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3989,7 +3989,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -4006,7 +4006,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -4033,7 +4033,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: 
[[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4050,7 +4050,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4067,7 +4067,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; 
GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4084,7 +4084,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4101,7 +4101,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue 
{ i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4267,7 +4267,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4284,7 +4284,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4301,7 +4301,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float 
[[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4318,7 +4318,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4335,7 +4335,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4372,7 +4372,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4389,7 +4389,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ 
-4406,7 +4406,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4423,7 +4423,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4440,7 +4440,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: 
[[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4457,7 +4457,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -4474,7 +4474,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -4501,7 +4501,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4518,7 +4518,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4535,7 +4535,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; 
GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4552,7 +4552,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -4569,7 +4569,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_ ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll index d48e7317abb5d..0976022825ced 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll @@ -92,7 +92,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -109,7 +109,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst 
seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -126,7 +126,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -143,7 +143,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -165,7 +165,7 @@ define float 
@test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -197,7 +197,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -214,7 +214,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, 
!amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -231,7 +231,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -248,7 +248,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -270,7 +270,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] 
= bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -287,7 +287,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -314,7 +314,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -331,7 +331,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -348,7 +348,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -387,7 +387,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -419,7 +419,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -436,7 +436,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -453,7 +453,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -470,7 +470,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -492,7 +492,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -524,7 +524,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -541,7 +541,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -558,7 +558,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -575,7 +575,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -597,7 +597,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -651,7 +651,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to float @@ -668,7 +668,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -685,7 +685,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -712,7 +712,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -744,7 +744,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -761,7 +761,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -778,7 +778,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; 
GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -795,7 +795,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -817,7 +817,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] 
= extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -834,7 +834,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -861,7 +861,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -878,7 +878,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -895,7 +895,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -922,7 +922,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] 
= bitcast i32 [[NEWLOADED]] to float @@ -954,7 +954,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -971,7 +971,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -988,7 +988,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], 
i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1015,7 +1015,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1047,7 +1047,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = 
bitcast i32 [[NEWLOADED]] to float @@ -1064,7 +1064,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1081,7 +1081,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1108,7 +1108,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], 
i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1166,7 +1166,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1183,7 +1183,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1200,7 +1200,7 @@ define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1217,7 +1217,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1239,7 +1239,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst 
seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1271,7 +1271,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1288,7 +1288,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1305,7 +1305,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to 
i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1322,7 +1322,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1344,7 +1344,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1361,7 +1361,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1388,7 +1388,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1405,7 +1405,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst 
seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1422,7 +1422,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1439,7 +1439,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to float @@ -1461,7 +1461,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1493,7 +1493,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1510,7 +1510,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1527,7 +1527,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1544,7 +1544,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast 
i32 [[NEWLOADED]] to float @@ -1566,7 +1566,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1598,7 +1598,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1615,7 +1615,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1632,7 +1632,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1649,7 +1649,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast 
i32 [[NEWLOADED]] to float @@ -1671,7 +1671,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1725,7 +1725,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1742,7 +1742,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; 
GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1774,7 +1774,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1806,7 +1806,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1823,7 +1823,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX906-NEXT: 
[[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1840,7 +1840,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1857,7 +1857,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1879,7 +1879,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1896,7 +1896,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1923,7 +1923,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1940,7 +1940,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1972,7 +1972,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2004,7 +2004,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2021,7 +2021,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2053,7 +2053,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2085,7 +2085,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2102,7 +2102,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2134,7 +2134,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal ; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2192,7 +2192,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2214,7 +2214,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2236,7 +2236,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2280,7 +2280,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2302,7 +2302,7 @@ define float 
@test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2324,7 +2324,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode ; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -2372,7 +2372,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2389,7 +2389,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2406,7 +2406,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2423,7 +2423,7 @@ define float 
@test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2440,7 +2440,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2477,7 +2477,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; 
GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2494,7 +2494,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2511,7 +2511,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2528,7 +2528,7 @@ define float 
@test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2545,7 +2545,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2562,7 +2562,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: 
[[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2579,7 +2579,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2606,7 +2606,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2623,7 +2623,7 @@ define float 
@test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2640,7 +2640,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2657,7 +2657,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2674,7 +2674,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2733,7 +2733,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2750,7 +2750,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2767,7 +2767,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2784,7 +2784,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = 
bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2801,7 +2801,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2838,7 +2838,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] 
= extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2855,7 +2855,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2872,7 +2872,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2889,7 +2889,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2906,7 +2906,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2923,7 +2923,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; 
GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2940,7 +2940,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -2967,7 +2967,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -2984,7 +2984,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; 
GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3001,7 +3001,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3018,7 +3018,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3035,7 +3035,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3098,7 +3098,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3115,7 +3115,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], 
float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3132,7 +3132,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3149,7 +3149,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory 
[[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3166,7 +3166,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3203,7 +3203,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3220,7 +3220,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; 
GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3237,7 +3237,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3254,7 +3254,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3271,7 +3271,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3288,7 +3288,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3305,7 +3305,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float 
[[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3332,7 +3332,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3349,7 +3349,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory 
[[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3366,7 +3366,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3383,7 +3383,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3400,7 +3400,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo ; GFX940-NEXT: 
[[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3459,7 +3459,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3476,7 +3476,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3493,7 +3493,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3510,7 +3510,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3527,7 +3527,7 @@ define float 
@test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3564,7 +3564,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3581,7 +3581,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; 
GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3598,7 +3598,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3615,7 +3615,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3632,7 +3632,7 @@ define float 
@test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3649,7 +3649,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3666,7 +3666,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP5:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -3693,7 +3693,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3710,7 +3710,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3727,7 
+3727,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3744,7 +3744,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float @@ -3761,7 +3761,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: 
[[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll index 19b02a364ac11..af9933fa9e726 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll @@ -199,7 +199,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -216,7 +216,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], 
i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -233,7 +233,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -260,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; 
GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -277,7 +277,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -294,7 +294,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -316,7 +316,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -333,7 +333,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -350,7 +350,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; 
GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -367,7 +367,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -389,7 +389,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -406,7 +406,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], 
i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -423,7 +423,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -445,7 +445,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 
} [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -462,7 +462,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -479,7 +479,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -506,7 +506,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: 
[[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -523,7 +523,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -540,7 +540,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -562,7 +562,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -579,7 +579,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -596,7 +596,7 @@ define double 
@test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -623,7 +623,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -640,7 +640,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 
[[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -657,7 +657,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -679,7 +679,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { 
i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -696,7 +696,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -713,7 +713,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -740,7 +740,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], 
[[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -757,7 +757,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -774,7 +774,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 
[[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1008,7 +1008,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1030,7 +1030,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1052,7 +1052,7 @@ define double 
@test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1331,7 +1331,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1348,7 +1348,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] 
= cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1365,7 +1365,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1397,7 +1397,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1414,7 +1414,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1436,7 +1436,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1453,7 +1453,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call double 
@llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1470,7 +1470,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1487,7 +1487,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: 
[[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1509,7 +1509,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1526,7 +1526,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 
[[NEWLOADED]] to double @@ -1543,7 +1543,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1565,7 +1565,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1582,7 +1582,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; 
GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1599,7 +1599,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1631,7 +1631,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1648,7 +1648,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1927,7 +1927,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { 
i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1944,7 +1944,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1961,7 +1961,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1993,7 +1993,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = 
bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2010,7 +2010,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2032,7 +2032,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] 
syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2049,7 +2049,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2066,7 +2066,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2083,7 +2083,7 @@ define double 
@test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -2105,7 +2105,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -2122,7 +2122,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2139,7 +2139,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2161,7 +2161,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { 
i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2178,7 +2178,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2195,7 +2195,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2227,7 +2227,7 @@ define double 
@test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -2244,7 +2244,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll index e56417167c33b..d01dd2eb29538 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll +++ 
b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll @@ -92,7 +92,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -109,7 +109,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -126,7 +126,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, 
align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -153,7 +153,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -170,7 +170,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -187,7 +187,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; 
GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -209,7 +209,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -226,7 +226,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -243,7 +243,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -260,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -282,7 +282,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; 
GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -299,7 +299,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -316,7 +316,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -338,7 +338,7 @@ define 
double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -355,7 +355,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -372,7 +372,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -399,7 +399,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -416,7 +416,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -433,7 +433,7 @@ define 
double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -455,7 +455,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -472,7 +472,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -489,7 +489,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -516,7 +516,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -533,7 +533,7 
@@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -550,7 +550,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -572,7 +572,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -589,7 +589,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -606,7 +606,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -633,7 
+633,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -650,7 +650,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -667,7 +667,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX12-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -901,7 +901,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -923,7 +923,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(pt ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -945,7 +945,7 @@ define double 
@test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem ; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1117,7 +1117,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1134,7 +1134,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst 
seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1151,7 +1151,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1183,7 +1183,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1200,7 +1200,7 @@ 
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1222,7 +1222,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1239,7 +1239,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst 
seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1256,7 +1256,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1273,7 +1273,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1295,7 +1295,7 @@ define 
double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1312,7 +1312,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1329,7 +1329,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: 
[[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1351,7 +1351,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1368,7 +1368,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 
[[NEWLOADED]] to double @@ -1385,7 +1385,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1417,7 +1417,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1434,7 +1434,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double 
[[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1606,7 +1606,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1623,7 +1623,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 
1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1640,7 +1640,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1672,7 +1672,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1689,7 +1689,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; 
GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1711,7 +1711,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1728,7 +1728,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; 
GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1745,7 +1745,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1762,7 +1762,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1784,7 +1784,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX10-NEXT: 
[[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1801,7 +1801,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1818,7 +1818,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1840,7 +1840,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1857,7 +1857,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1874,7 +1874,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: 
[[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1906,7 +1906,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1923,7 +1923,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem ; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, 
align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll index 5dbf2f6e696e1..c3a0a4192ff17 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll @@ -228,7 +228,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -248,7 +248,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -268,7 +268,7 @@ define i32 @test_atomicrmw_nand_i32_global_agent__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll index 175f75634e706..be3aaeb170673 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll @@ -228,7 +228,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -248,7 +248,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -268,7 +268,7 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = 
extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -403,7 +403,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -423,7 +423,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -443,7 +443,7 @@ define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], 
[[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -488,7 +488,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -508,7 +508,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -528,7 +528,7 @@ define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -573,7 +573,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; 
COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -593,7 +593,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -613,7 +613,7 @@ define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -658,7 +658,7 @@ 
define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -678,7 +678,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -698,7 +698,7 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] 
= select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll index bd37f5ba88c6b..77fe5e2aba913 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll @@ -228,7 +228,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -248,7 +248,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], 
[[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -268,7 +268,7 @@ define i64 @test_atomicrmw_nand_i64_global_agent__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll index ecb898d120dd1..bd2aa846efb21 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll @@ -228,7 +228,7 @@ define i64 
@test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -248,7 +248,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -268,7 +268,7 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -403,7 +403,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -423,7 +423,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -443,7 +443,7 @@ define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -488,7 +488,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -508,7 +508,7 @@ define 
i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory(ptr ad ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -528,7 +528,7 @@ define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory_ ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -573,7 +573,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], 
[[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -593,7 +593,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -613,7 +613,7 @@ define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 
[[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -658,7 +658,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -678,7 +678,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory(ptr a ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br 
i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] @@ -698,7 +698,7 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory ; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] ; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] -; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 ; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll index 3c5f3a09082a7..e79bb465563e8 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll @@ -126,12 +126,12 @@ define i16 @test_cmpxchg_i16_global_agent_align4(ptr addrspace(1) %out, i16 %in, define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-LABEL: define void @syncscope_workgroup_nortn( -; GFX90A-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0]] { ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(3) -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr 
addrspace(3) [[TMP1]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]] +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]], !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] ; GFX90A: atomicrmw.check.private: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[ADDR]]) @@ -144,7 +144,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]], !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] @@ -152,8 +152,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: ret void ; ; GFX1100-LABEL: define void @syncscope_workgroup_nortn( -; GFX1100-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { -; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]] +; GFX1100-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0]] { +; GFX1100-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]], !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] ; GFX1100-NEXT: ret void ; %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst, !mmra !2, 
!amdgpu.no.fine.grained.memory !3, !amdgpu.ignore.denormal.mode !3 @@ -193,8 +193,10 @@ define i32 @atomic_load_global_align1(ptr addrspace(1) %ptr) { ; GFX90A: [[META0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]]} ; GFX90A: [[META1]] = !{!"foo", !"bar"} ; GFX90A: [[META2]] = !{!"bux", !"baz"} +; GFX90A: [[META3]] = !{} ;. ; GFX1100: [[META0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]]} ; GFX1100: [[META1]] = !{!"foo", !"bar"} ; GFX1100: [[META2]] = !{!"bux", !"baz"} +; GFX1100: [[META3]] = !{} ;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index e8b4e752d3a28..db01f221f2911 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -13,7 +13,7 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -83,7 +83,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("workgroup") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -150,7 +150,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX908: atomicrmw.shared: ; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(3) -; GFX908-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX908-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] ; GFX908: atomicrmw.check.private: ; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[ADDR]]) @@ -163,7 +163,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-NEXT: br label [[ATOMICRMW_PHI]] ; GFX908: atomicrmw.global: ; GFX908-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) -; GFX908-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX908-NEXT: br label 
[[ATOMICRMW_PHI]] ; GFX908: atomicrmw.phi: ; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] @@ -188,7 +188,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] @@ -303,7 +303,7 @@ define float @flat_atomicrmw_fadd_f32__align32(ptr %addr, float %val) { ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 32, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 163f436ebc9bd..84d9a64efa0f7 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -282,7 +282,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr 
addrspace(1) %ptr, float ; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -298,7 +298,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr addrspace(1) %ptr, float ; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -314,7 +314,7 @@ define float @test_atomicrmw_fadd_f32_global_unsafe(ptr addrspace(1) %ptr, float ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -347,7 +347,7 @@ define float @test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt ; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -363,7 +363,7 @@ define float @test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt ; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -379,7 +379,7 @@ define float 
@test_atomicrmw_fadd_f32_buffer_fat_ptr_unsafe(ptr addrspace(7) %pt ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -412,7 +412,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa ; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -428,7 +428,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa ; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, 
align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -444,7 +444,7 @@ define float @test_atomicrmw_fadd_f32_as999_unsafe(ptr addrspace(999) %ptr, floa ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(999) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -477,7 +477,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub ; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] 
to double @@ -493,7 +493,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub ; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -509,7 +509,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -533,7 +533,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe(ptr addrspace(1) %ptr, doub ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 
[[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -554,7 +554,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -570,7 +570,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, 
i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -586,7 +586,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -599,7 +599,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] ; GFX90A: atomicrmw.check.private: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) @@ -612,7 +612,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to 
ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: ; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] @@ -621,19 +621,19 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: ret float [[RES]] ; - %res = atomicrmw fadd ptr %ptr, float %value syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %res = atomicrmw fadd ptr %ptr, float %value syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret float %res } -define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, 
double %value) #3 { -; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: @@ -641,7 +641,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -649,7 +649,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; CI: atomicrmw.end: ; CI-NEXT: ret double [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: @@ -657,7 +657,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX9-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -665,7 +665,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret double [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: @@ -673,7 +673,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -681,15 +681,15 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret double [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, 
!amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: ret double [[RES]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: ret double [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: @@ -697,13 +697,174 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double ; GFX11-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; CI-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; CI-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CI: atomicrmw.private: +; CI-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CI-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; CI-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CI: atomicrmw.global: +; CI-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end1: +; CI-NEXT: br label [[ATOMICRMW_PHI]] +; CI: atomicrmw.phi: +; CI-NEXT: [[RES:%.*]] = phi 
double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; CI-NEXT: br label [[ATOMICRMW_END:%.*]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[RES]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX9-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX9-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX9: atomicrmw.private: +; GFX9-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX9-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX9-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX9: atomicrmw.global: +; GFX9-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end1: +; GFX9-NEXT: br label [[ATOMICRMW_PHI]] +; GFX9: atomicrmw.phi: +; GFX9-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX9-NEXT: br label 
[[ATOMICRMW_END:%.*]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[RES]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end1: +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; 
GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX940: atomicrmw.private: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX940-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX940: atomicrmw.global: +; GFX940-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace 
[[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: br label [[ATOMICRMW_PHI]] +; GFX940: atomicrmw.phi: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX11-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX11-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX11: atomicrmw.private: +; GFX11-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX11-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX11-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX11: atomicrmw.global: +; GFX11-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end1: +; GFX11-NEXT: br label 
[[ATOMICRMW_PHI]] +; GFX11: atomicrmw.phi: +; GFX11-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX11-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[RES]] ; %res = atomicrmw fadd ptr %ptr, double %value syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -823,7 +984,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1) ; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -839,7 +1000,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1) ; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -855,7 +1016,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1) ; 
GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -871,7 +1032,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1) ; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1067,6 +1228,41 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; ALL-LABEL: @test_atomicrmw_fadd_f64_flat( +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr 
addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]] +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[RES]] +; + %res = atomicrmw fadd ptr %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fadd_f64_flat__noprivate(ptr %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_flat__noprivate( ; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: @@ -1074,7 +1270,7 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] ; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; ALL-NEXT: [[TMP3:%.*]] = 
bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META1]] ; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1082,7 +1278,7 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; ALL: atomicrmw.end: ; ALL-NEXT: ret double [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, double %value seq_cst + %res = atomicrmw fadd ptr %ptr, double %value seq_cst, !noalias.addrspace !1 ret double %res } @@ -1303,7 +1499,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace ; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1319,7 +1515,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace ; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]] ; GFX9-NEXT: [[TMP2:%.*]] 
= bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1328,7 +1524,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp(ptr addrspace ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: ret void ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe_strictfp( @@ -1356,7 +1552,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory 
[[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1372,7 +1568,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1388,7 +1584,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1412,7 +1608,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8:[0-9]+]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -2119,7 +2315,7 @@ define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(pt ; CI-NEXT: ret void ; ; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( -; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( @@ -2619,18 +2815,31 @@ define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mo define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { ; ALL-LABEL: 
@test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] ; ALL: atomicrmw.end: ; ALL-NEXT: ret void ; @@ -2640,20 +2849,34 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_ define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { ; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, 
i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] ; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[TMP5]] +; ALL-NEXT: ret double [[RET]] ; %ret = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret double %ret @@ -3998,7 +4221,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace( ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; 
CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4014,7 +4237,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace( ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4030,7 +4253,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace( ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4054,7 +4277,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace( ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], 
[[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4075,7 +4298,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace( ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4091,7 +4314,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace( ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") 
seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4119,7 +4342,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace( ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4140,7 +4363,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 ; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4156,7 +4379,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 ; GFX9-NEXT: [[NEW:%.*]] = fadd 
<2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4172,7 +4395,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4188,7 +4411,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") 
seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4208,7 +4431,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4229,7 +4452,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x ; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4245,7 +4468,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> 
[[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4261,7 +4484,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4277,7 +4500,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst 
seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4297,7 +4520,7 @@ define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4318,7 +4541,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa ; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4334,7 +4557,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa ; GFX9-NEXT: [[NEW:%.*]] 
= fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4350,7 +4573,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4366,7 +4589,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4386,7 +4609,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspa ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4407,7 +4630,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace ; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x 
bfloat> @@ -4423,7 +4646,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4439,7 +4662,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4455,7 +4678,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4475,7 +4698,7 @@ define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4495,3 +4718,4 @@ attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #5 = { "denormal-fp-math"="dynamic,dynamic" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll index 2a5e1bde029ee..7859b8bb40734 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -188,6 +188,41 @@ define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, 
double %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE:%.*]]) +; GCN-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]] +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GCN-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr %ptr, double %value seq_cst + ret double %res +} + +define double 
@test_atomicrmw_fmax_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -195,7 +230,7 @@ define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, double %value) { ; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE:%.*]]) ; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -203,7 +238,7 @@ define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, double %value) { ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP6]] ; - %res = atomicrmw fmax ptr %ptr, double %value seq_cst + %res = atomicrmw fmax ptr %ptr, double %value seq_cst, !noalias.addrspace !0 ret double %res } @@ -257,6 +292,9 @@ define double @test_atomicrmw_fmax_f64_global_strictfp(ptr addrspace(1) %ptr, do %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst ret double %res } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX7: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll index 0fa409b11b42f..315af40ce3201 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -188,6 +188,41 @@ define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE:%.*]]) +; GCN-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]] +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GCN-NEXT: [[TMP8]] = 
bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -195,7 +230,7 @@ define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) { ; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE:%.*]]) ; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -203,7 +238,7 @@ define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) { ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP6]] ; - %res = atomicrmw fmin ptr %ptr, double %value seq_cst + %res = atomicrmw fmin ptr %ptr, double %value seq_cst, !noalias.addrspace !0 ret double %res } @@ -257,6 +292,9 @@ define double @test_atomicrmw_fmin_f64_global_strictfp(ptr addrspace(1) %ptr, do %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst ret double %res 
} + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX7: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll index bbcc6b8a2724f..b4e999e58d015 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll @@ -198,8 +198,8 @@ define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) { ret half %res } -define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { -; GCN-LABEL: @test_atomicrmw_fsub_f64_flat( +define double @test_atomicrmw_fsub_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fsub_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -207,13 +207,48 @@ define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { ; GCN-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE:%.*]] ; GCN-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]] ; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GCN-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr %ptr, double %value seq_cst, !noalias.addrspace !0 + ret double %res +} + +define double 
@test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fsub_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[NEW:%.*]] = fsub double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GCN-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW2:%.*]] = fsub double [[LOADED]], [[VALUE]] +; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] ; %res = atomicrmw fsub ptr %ptr, double %value seq_cst ret double %res @@ -625,3 +660,5 @@ define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, 
bfloat %val %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 4 ret bfloat %res } + +!0 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll index 26b84f82524e2..b4025c3cfb53c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll @@ -167,7 +167,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -184,7 +184,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -201,7 +201,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -218,7 +218,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -240,7 +240,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; 
GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -257,7 +257,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -284,7 +284,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -301,7 +301,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -318,7 +318,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -335,7 +335,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -357,7 +357,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -374,7 +374,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -401,7 +401,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -418,7 +418,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -435,7 +435,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -452,7 +452,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -474,7 +474,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX10-NEXT: 
[[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -491,7 +491,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -518,7 +518,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -535,7 +535,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -552,7 +552,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; 
GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -569,7 +569,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -591,7 +591,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -608,7 +608,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -635,7 +635,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -652,7 +652,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; 
GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -669,7 +669,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -686,7 +686,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -708,7 +708,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -725,7 +725,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x 
bfloat> @@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -903,7 +903,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] 
= cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -920,7 +920,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -942,7 +942,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -959,7 +959,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = 
bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1020,7 +1020,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1037,7 +1037,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1059,7 +1059,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1076,7 +1076,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1103,7 +1103,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1137,7 +1137,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1154,7 +1154,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1176,7 +1176,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 
4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1193,7 +1193,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to 
<2 x bfloat> @@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1254,7 +1254,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1271,7 +1271,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: 
[[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1293,7 +1293,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1310,7 +1310,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1337,7 +1337,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1354,7 +1354,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1371,7 +1371,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1388,7 +1388,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1410,7 +1410,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: 
[[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1427,7 +1427,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1480,7 +1480,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1502,7 +1502,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_m ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1524,7 +1524,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 
0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1568,7 +1568,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1590,7 +1590,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1612,7 +1612,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = 
bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1660,7 +1660,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1682,7 +1682,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_m ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] 
= cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1704,7 +1704,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1748,7 +1748,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1770,7 +1770,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1792,7 +1792,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1840,7 +1840,7 @@ define <2 x bfloat> 
@test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1862,7 +1862,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_m ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1884,7 +1884,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_gra ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; 
COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1928,7 +1928,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1950,7 +1950,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1972,7 +1972,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_deno ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1989,7 +1989,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} ;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. +; GFX90A: [[META0]] = !{} +;. ; GFX940: [[META0]] = !{} ;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. ; GFX12: [[META0]] = !{} ;. 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll index a2440def73aba..6700839d81480 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll @@ -72,7 +72,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -89,7 +89,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -106,7 +106,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -123,7 +123,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -145,7 +145,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -162,7 +162,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -189,7 +189,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -206,7 +206,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -223,7 +223,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -240,7 +240,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -262,7 +262,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -279,7 +279,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_ ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -306,7 +306,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX803-NEXT: 
[[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -323,7 +323,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -340,7 +340,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg 
ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -357,7 +357,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -379,7 +379,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to 
<2 x bfloat> @@ -396,7 +396,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -423,7 +423,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -440,7 +440,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -457,7 +457,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -474,7 +474,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -496,7 +496,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -513,7 +513,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -540,7 +540,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast 
<2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -557,7 +557,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -574,7 +574,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -591,7 +591,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -613,7 +613,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -630,7 +630,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -679,7 +679,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -696,7 +696,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; 
GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -713,7 +713,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -730,7 +730,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -752,7 +752,7 @@ define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -769,7 +769,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -796,7 +796,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -813,7 +813,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -830,7 +830,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -847,7 +847,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; 
GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -869,7 +869,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -886,7 +886,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, 
!amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -913,7 +913,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -930,7 +930,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -947,7 +947,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; 
GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -964,7 +964,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -986,7 +986,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1003,7 +1003,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1030,7 +1030,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to <2 x bfloat> @@ -1047,7 +1047,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1064,7 +1064,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1081,7 +1081,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; 
GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1103,7 +1103,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1120,7 +1120,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue 
{ i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1147,7 +1147,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1164,7 +1164,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1181,7 +1181,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; 
GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1198,7 +1198,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1220,7 +1220,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1237,7 +1237,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1290,7 +1290,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1312,7 +1312,7 @@ define <2 x bfloat> 
@test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_ ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1334,7 +1334,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1378,7 +1378,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; 
COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1400,7 +1400,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1422,7 +1422,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1470,7 
+1470,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1492,7 +1492,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_ ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1514,7 +1514,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x 
bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1558,7 +1558,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1580,7 +1580,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, 
!amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1602,7 +1602,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1650,7 +1650,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1672,7 +1672,7 @@ define <2 x 
bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_ ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1694,7 +1694,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1738,7 +1738,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = 
bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1760,7 +1760,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1782,7 +1782,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], 
!amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -1799,7 +1799,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} ;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. +; GFX90A: [[META0]] = !{} +;. ; GFX940: [[META0]] = !{} ;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. ; GFX12: [[META0]] = !{} ;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll index 9d396aad18f23..c28e11f60f389 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll @@ -167,7 +167,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -184,7 +184,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; 
GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -201,7 +201,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -228,7 +228,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -245,7 +245,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -272,7 +272,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -289,7 +289,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; 
GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -306,7 +306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -323,7 +323,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -345,7 +345,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -362,7 +362,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -389,7 +389,7 @@ define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -406,7 +406,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -423,7 +423,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -450,7 +450,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -467,7 +467,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], 
!amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -494,7 +494,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -511,7 +511,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -528,7 +528,7 @@ define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -555,7 +555,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -572,7 +572,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -599,7 +599,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -616,7 +616,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], 
!amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -633,7 +633,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -660,7 +660,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -677,7 +677,7 @@ define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -821,7 +821,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -838,7 +838,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -855,7 +855,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -882,7 +882,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { 
i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -899,7 +899,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -926,7 +926,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -943,7 +943,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> 
[[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -960,7 +960,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -977,7 +977,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -999,7 +999,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1016,7 +1016,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1043,7 +1043,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast 
<2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1060,7 +1060,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1077,7 +1077,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1104,7 +1104,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1121,7 +1121,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, 
i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1148,7 +1148,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1165,7 +1165,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1182,7 +1182,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = 
bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1209,7 +1209,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1226,7 +1226,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1253,7 +1253,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1270,7 +1270,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, 
i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1287,7 +1287,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1314,7 +1314,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1331,7 +1331,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast 
<2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1384,7 +1384,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1406,7 +1406,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memo ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1428,7 +1428,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1472,7 +1472,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1494,7 +1494,7 @@ 
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1516,7 +1516,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1564,7 +1564,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1586,7 +1586,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memo ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1608,7 +1608,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1652,7 +1652,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1674,7 +1674,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] 
to <2 x half> @@ -1696,7 +1696,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1744,7 +1744,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1766,7 +1766,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memo ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; 
COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1788,7 +1788,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_graine ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1832,7 +1832,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] 
syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1854,7 +1854,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1876,7 +1876,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denorma ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; 
COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1893,9 +1893,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} ;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. ; GFX90A: [[META0]] = !{} ;. ; GFX940: [[META0]] = !{} ;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. ; GFX12: [[META0]] = !{} ;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll index 29d9473073adb..19f02dea21329 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll @@ -72,7 +72,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -89,7 +89,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -106,7 +106,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -133,7 +133,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: 
[[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -150,7 +150,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -177,7 +177,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -194,7 +194,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -211,7 +211,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -228,7 +228,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -250,7 +250,7 @@ define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -267,7 +267,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -294,7 +294,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -311,7 +311,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -328,7 +328,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -355,7 +355,7 @@ define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -372,7 +372,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -399,7 +399,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -416,7 +416,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -433,7 +433,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] 
= bitcast i32 [[NEWLOADED]] to <2 x half> @@ -460,7 +460,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -477,7 +477,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -504,7 +504,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = 
cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -521,7 +521,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -538,7 +538,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; 
GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -565,7 +565,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -582,7 +582,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -631,7 +631,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; 
GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -648,7 +648,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -665,7 +665,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -692,7 +692,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -709,7 +709,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -736,7 +736,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -753,7 +753,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -770,7 +770,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to <2 x half> @@ -787,7 +787,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -809,7 +809,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -826,7 +826,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -853,7 +853,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -870,7 +870,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -887,7 +887,7 @@ 
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -914,7 +914,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -931,7 +931,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -958,7 +958,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -975,7 +975,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; 
GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -992,7 +992,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1019,7 +1019,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1036,7 +1036,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; 
GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1063,7 +1063,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1080,7 +1080,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, 
i1 } [[TMP4]], 1 ; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1097,7 +1097,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1124,7 +1124,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1141,7 +1141,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x 
half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1194,7 +1194,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1216,7 +1216,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_mem ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1238,7 +1238,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1282,7 +1282,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1304,7 +1304,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> 
[[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1326,7 +1326,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] ; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1374,7 +1374,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory 
[[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1396,7 +1396,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_mem ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1418,7 +1418,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1462,7 +1462,7 @@ define <2 x half> 
@test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1484,7 +1484,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1506,7 +1506,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr 
addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1554,7 +1554,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1576,7 +1576,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_mem ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; 
COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1598,7 +1598,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1642,7 +1642,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1664,7 +1664,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 
x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1686,7 +1686,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm ; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) ; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 ; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] ; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -1703,9 +1703,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} ;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. ; GFX90A: [[META0]] = !{} ;. ; GFX940: [[META0]] = !{} ;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. ; GFX12: [[META0]] = !{} ;. 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll index 3de502874d323..9e9503dfbd381 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -14,7 +14,21 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]], !amdgpu.no.fine.grained.memory [[META1:![0-9]+]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -24,7 +38,21 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { define i64 
@test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -34,7 +62,21 @@ define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: 
+; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = or i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -44,7 +86,21 @@ define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], 
!amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -58,7 +114,7 @@ define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -68,7 +124,7 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory 
!0 @@ -78,7 +134,7 @@ define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -88,7 +144,7 @@ define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -102,7 +158,21 @@ define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, 
!amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -112,7 +182,7 @@ define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, 
!amdgpu.no.fine.grained.memory !0 @@ -122,7 +192,7 @@ define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw add ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -136,7 +206,20 @@ define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: store i64 [[VALUE]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory 
[[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -146,7 +229,7 @@ define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -156,7 +239,7 @@ define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5(ptr %ptr, i define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw xchg ptr %ptr, 
i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -170,7 +253,20 @@ define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5(ptr %ptr, i define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspace(1) %value) { ; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: store ptr addrspace(1) [[VALUE]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi ptr addrspace(1) [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret ptr addrspace(1) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -180,7 +276,7 @@ define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspa define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(1) %value) { ; ALL-LABEL: define 
ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret ptr addrspace(1) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -190,7 +286,7 @@ define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5 define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(3) %value) { ; ALL-LABEL: define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(3) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret ptr addrspace(3) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(3) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -204,7 +300,21 @@ define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5 define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and 
ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = and i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -214,7 +324,7 @@ define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value 
syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -224,24 +334,149 @@ define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 ret i64 %res } - define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw and ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret i32 %res } +define i64 @test_flat_atomicrmw_and_i64_agent__mmra(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__mmra( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label 
%[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = and i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !mmra [[META2:![0-9]+]], !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !mmra !4, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__mmra(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__mmra( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !mmra [[META2]], !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !mmra !4, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; General expansion for sub +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_sub_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent( +; ALL-SAME: 
ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 
@test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_sub_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_sub_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i64 @test_flat_atomicrmw_sub_i64_agent__mmra(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__mmra( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !mmra [[META2]], !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br 
label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !mmra !4, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__mmra(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_i64_agent__noalias_addrspace_5__mmra( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !mmra [[META2]], !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !mmra !4, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + ; -------------------------------------------------------------------- ; General expansion for fadd ; -------------------------------------------------------------------- @@ -249,81 +484,165 @@ define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[NEW:%.*]] = fadd 
double [[LOADED_PRIVATE]], [[VALUE]] +; GFX7-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX7: [[ATOMICRMW_START]]: -; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END1]]: +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ 
[[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX7: [[ATOMICRMW_END]]: -; GFX7-NEXT: ret double [[TMP5]] +; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX900-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW2:%.*]] = 
fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX900-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: 
[[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: 
[[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; 
GFX940-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: [[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX12-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 
8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -339,7 +658,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; 
GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -356,7 +675,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -373,7 +692,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to 
double @@ -383,12 +702,12 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( @@ -400,7 +719,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -422,7 +741,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -439,7 +758,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -456,7 +775,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 
[[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -473,7 +792,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -483,7 +802,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -495,7 +814,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; GFX12-NEXT: 
[[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -517,7 +836,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -534,7 +853,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], 
!amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -551,7 +870,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -565,7 +884,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] ; GFX90A: [[ATOMICRMW_SHARED]]: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] ; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr 
[[PTR]]) @@ -578,7 +897,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_GLOBAL]]: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_PHI]]: ; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] @@ -588,12 +907,12 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX940-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX940-NEXT: ret float [[RES]] ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fadd ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -610,7 +929,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -627,7 +946,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; 
GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -644,7 +963,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -661,7 +980,7 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -671,12 +990,12 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; ; GFX940-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( ; GFX940-SAME: 
ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret <2 x half> [[RES]] ; ; GFX12-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -693,7 +1012,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ 
-710,7 +1029,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -727,7 +1046,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -744,7 +1063,7 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] 
= cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -754,12 +1073,12 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; ; GFX940-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; ; GFX12-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -773,69 +1092,153 @@ define <2 x bfloat> 
@test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX7-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX7: [[ATOMICRMW_END]]: ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] 
to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX900-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX900-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] 
to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX908-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; 
GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: 
[[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX90A-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX940-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: 
[[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX12-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: 
[[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -844,7 +1247,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], 
!amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( @@ -856,7 +1259,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -873,7 +1276,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -883,12 +1286,12 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double 
@test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( @@ -900,7 +1303,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to 
double @@ -922,7 +1325,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -939,7 +1342,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -956,7 +1359,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] 
syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -973,7 +1376,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -983,7 +1386,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -995,7 +1398,7 @@ define double 
@test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1010,7 +1413,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { ; GFX7-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret float [[RES]] ; ; GFX900-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( @@ -1022,7 +1425,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") 
seq_cst seq_cst, align 4 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1039,7 +1442,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1056,7 +1459,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float @@ -1073,7 +1476,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -1083,7 +1486,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -1097,69 +1500,153 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], 
double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX7-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX7: [[ATOMICRMW_END]]: ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX900-NEXT: 
store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX900-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: 
[[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX908-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; 
GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX90A-NEXT: store double [[TMP2]], ptr 
addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX940-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: [[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; 
GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX12-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP5:%.*]] = 
bitcast double [[TMP4]] to i64 +; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -1168,7 +1655,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( @@ -1180,7 +1667,7 @@ define double 
@test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1197,7 +1684,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double @@ -1207,12 +1694,12 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] 
syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( @@ -1224,7 +1711,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, ; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1246,7 +1733,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double 
[[VALUE]]) ; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 ; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 ; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 ; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double @@ -1263,7 +1750,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1280,7 +1767,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] 
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1297,7 +1784,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1307,7 +1794,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -1319,7 +1806,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) ; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX12-NEXT: [[TMP3:%.*]] = 
bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 ; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 ; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double @@ -1334,7 +1821,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { ; GFX7-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret float [[RES]] ; ; GFX900-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( @@ -1346,7 +1833,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX900-NEXT: [[SUCCESS:%.*]] 
= extractvalue { i32, i1 } [[TMP4]], 1 ; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1363,7 +1850,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float @@ -1380,7 +1867,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float @@ -1397,7 +1884,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX940-NEXT: [[TMP2:%.*]] = call float 
@llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) ; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 ; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 ; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float @@ -1407,7 +1894,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -1421,16 +1908,31 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 -; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] -; ALL: [[ATOMICRMW_START]]: -; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], 
%[[ATOMICRMW_START]] ] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 -; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP3:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED1:%.*]] = phi i64 [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP4:%.*]] = and i64 [[LOADED1]], [[VALUE]] +; ALL-NEXT: [[NEW2:%.*]] = xor i64 [[TMP4]], -1 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED1]], i64 [[NEW2]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END1]]: +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED]], %[[ATOMICRMW_PRIVATE]] ], [ [[NEWLOADED]], %[[ATOMICRMW_END1]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] ; 
ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; @@ -1447,7 +1949,7 @@ define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5(ptr %ptr, i6 ; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 ; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] @@ -1467,7 +1969,7 @@ define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_ ; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8, !noalias.addrspace [[META0]] ; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 ; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 ; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] @@ -1488,7 +1990,7 @@ define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i3 ; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] ; ALL-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] ; ALL-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 -; 
ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 ; ALL-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP3]], 0 ; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] @@ -1501,23 +2003,45 @@ define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i3 !0 = !{} !1 = !{i32 5, i32 6} +!2 = !{!"foo", !"bar"} +!3 = !{!"bux", !"baz"} +!4 = !{!2, !3} +!5 = !{} ;. -; GFX7: [[META0]] = !{} -; GFX7: [[META1]] = !{i32 5, i32 6} +; GFX7: [[META0]] = !{i32 5, i32 6} +; GFX7: [[META1]] = !{} +; GFX7: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX7: [[META3]] = !{!"foo", !"bar"} +; GFX7: [[META4]] = !{!"bux", !"baz"} ;. -; GFX900: [[META0]] = !{} -; GFX900: [[META1]] = !{i32 5, i32 6} +; GFX900: [[META0]] = !{i32 5, i32 6} +; GFX900: [[META1]] = !{} +; GFX900: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX900: [[META3]] = !{!"foo", !"bar"} +; GFX900: [[META4]] = !{!"bux", !"baz"} ;. -; GFX908: [[META0]] = !{} -; GFX908: [[META1]] = !{i32 5, i32 6} +; GFX908: [[META0]] = !{i32 5, i32 6} +; GFX908: [[META1]] = !{} +; GFX908: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX908: [[META3]] = !{!"foo", !"bar"} +; GFX908: [[META4]] = !{!"bux", !"baz"} ;. -; GFX90A: [[META0]] = !{} -; GFX90A: [[META1]] = !{i32 5, i32 6} +; GFX90A: [[META0]] = !{i32 5, i32 6} +; GFX90A: [[META1]] = !{} +; GFX90A: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX90A: [[META3]] = !{!"foo", !"bar"} +; GFX90A: [[META4]] = !{!"bux", !"baz"} ;. 
-; GFX940: [[META0]] = !{} -; GFX940: [[META1]] = !{i32 5, i32 6} +; GFX940: [[META0]] = !{i32 5, i32 6} +; GFX940: [[META1]] = !{} +; GFX940: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX940: [[META3]] = !{!"foo", !"bar"} +; GFX940: [[META4]] = !{!"bux", !"baz"} ;. -; GFX12: [[META0]] = !{} -; GFX12: [[META1]] = !{i32 5, i32 6} +; GFX12: [[META0]] = !{i32 5, i32 6} +; GFX12: [[META1]] = !{} +; GFX12: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; GFX12: [[META3]] = !{!"foo", !"bar"} +; GFX12: [[META4]] = !{!"bux", !"baz"} ;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index 8e5b7806a5904..80058b3cef4ea 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -35,8 +35,22 @@ define i32 @test_atomicrmw_or_0_global_one_as(ptr addrspace(1) %ptr) { define i32 @test_atomicrmw_or_0_flat_system(ptr %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_flat_system( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CHECK: atomicrmw.private: +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[LOADED_PRIVATE:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED_PRIVATE]], 0 +; CHECK-NEXT: store i32 [[NEW]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CHECK: atomicrmw.global: +; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4, !noalias.addrspace 
[[META1:![0-9]+]] +; CHECK-NEXT: br label [[ATOMICRMW_PHI]] +; CHECK: atomicrmw.phi: +; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[RES]], [[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label [[ATOMICRMW_END:%.*]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: ret i32 [[RES1]] ; %res = atomicrmw or ptr %ptr, i32 0 seq_cst ret i32 %res diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll new file mode 100644 index 0000000000000..6b3c27be8688c --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand %s | FileCheck %s + +define { i16, i1 } @cmpxchg_flat_agent_i16(ptr %ptr, i16 %val, i16 %swap) { +; CHECK-LABEL: define { i16, i1 } @cmpxchg_flat_agent_i16( +; CHECK-SAME: ptr [[PTR:%.*]], i16 [[VAL:%.*]], i16 [[SWAP:%.*]]) { +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[SWAP]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[VAL]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; CHECK-NEXT: br label %[[PARTWORD_CMPXCHG_LOOP:.*]] +; CHECK: [[PARTWORD_CMPXCHG_LOOP]]: +; CHECK-NEXT: [[TMP9:%.*]] 
= phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], %[[PARTWORD_CMPXCHG_FAILURE:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") monotonic seq_cst, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PARTWORD_CMPXCHG_END:.*]], label %[[PARTWORD_CMPXCHG_FAILURE]] +; CHECK: [[PARTWORD_CMPXCHG_FAILURE]]: +; CHECK-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[PARTWORD_CMPXCHG_LOOP]], label %[[PARTWORD_CMPXCHG_END]] +; CHECK: [[PARTWORD_CMPXCHG_END]]: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i16, i1 } poison, i16 [[EXTRACTED]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i16, i1 } [[TMP17]], i1 [[TMP14]], 1 +; CHECK-NEXT: ret { i16, i1 } [[TMP18]] +; + %result = cmpxchg ptr %ptr, i16 %val, i16 %swap syncscope("agent") monotonic seq_cst + ret { i16, i1 } %result +} + +define { i16, i1 } @cmpxchg_flat_agent_i16_align4(ptr %ptr, i16 %val, i16 %swap) { +; CHECK-LABEL: define { i16, i1 } @cmpxchg_flat_agent_i16_align4( +; CHECK-SAME: ptr [[PTR:%.*]], i16 [[VAL:%.*]], i16 [[SWAP:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[SWAP]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[VAL]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], -65536 +; CHECK-NEXT: br label %[[PARTWORD_CMPXCHG_LOOP:.*]] +; CHECK: [[PARTWORD_CMPXCHG_LOOP]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[TMP11:%.*]], %[[PARTWORD_CMPXCHG_FAILURE:.*]] ] +; 
CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP7]], i32 [[TMP6]] syncscope("agent") monotonic seq_cst, align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PARTWORD_CMPXCHG_END:.*]], label %[[PARTWORD_CMPXCHG_FAILURE]] +; CHECK: [[PARTWORD_CMPXCHG_FAILURE]]: +; CHECK-NEXT: [[TMP11]] = and i32 [[TMP9]], -65536 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP5]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[PARTWORD_CMPXCHG_LOOP]], label %[[PARTWORD_CMPXCHG_END]] +; CHECK: [[PARTWORD_CMPXCHG_END]]: +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP9]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i16, i1 } poison, i16 [[EXTRACTED]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i16, i1 } [[TMP13]], i1 [[TMP10]], 1 +; CHECK-NEXT: ret { i16, i1 } [[TMP14]] +; + %result = cmpxchg ptr %ptr, i16 %val, i16 %swap syncscope("agent") monotonic seq_cst, align 4 + ret { i16, i1 } %result +} + +define { i32, i1 } @cmpxchg_flat_agent_i32(ptr %ptr, i32 %val, i32 %swap) { +; CHECK-LABEL: define { i32, i1 } @cmpxchg_flat_agent_i32( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[VAL:%.*]], i32 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i32 [[VAL]], i32 [[SWAP]] syncscope("agent") monotonic seq_cst, align 4 +; CHECK-NEXT: ret { i32, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i32 %val, i32 %swap syncscope("agent") monotonic seq_cst + ret { i32, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8 +; CHECK-NEXT: 
ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst + ret { i64, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64_volatile(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_volatile( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg volatile ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8 +; CHECK-NEXT: ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg volatile ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst + ret { i64, i1 } %result +} + +define { i16, i1 } @cmpxchg_flat_agent_i16__noprivate(ptr %ptr, i16 %val, i16 %swap) { +; CHECK-LABEL: define { i16, i1 } @cmpxchg_flat_agent_i16__noprivate( +; CHECK-SAME: ptr [[PTR:%.*]], i16 [[VAL:%.*]], i16 [[SWAP:%.*]]) { +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR]], i64 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[SWAP]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[VAL]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]] +; CHECK-NEXT: br label %[[PARTWORD_CMPXCHG_LOOP:.*]] +; CHECK: [[PARTWORD_CMPXCHG_LOOP]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], %[[PARTWORD_CMPXCHG_FAILURE:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or i32 
[[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") monotonic seq_cst, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PARTWORD_CMPXCHG_END:.*]], label %[[PARTWORD_CMPXCHG_FAILURE]] +; CHECK: [[PARTWORD_CMPXCHG_FAILURE]]: +; CHECK-NEXT: [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[PARTWORD_CMPXCHG_LOOP]], label %[[PARTWORD_CMPXCHG_END]] +; CHECK: [[PARTWORD_CMPXCHG_END]]: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i16, i1 } poison, i16 [[EXTRACTED]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i16, i1 } [[TMP17]], i1 [[TMP14]], 1 +; CHECK-NEXT: ret { i16, i1 } [[TMP18]] +; + %result = cmpxchg ptr %ptr, i16 %val, i16 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !0 + ret { i16, i1 } %result +} + +define { i32, i1 } @cmpxchg_flat_agent_i32__noprivate(ptr %ptr, i32 %val, i32 %swap) { +; CHECK-LABEL: define { i32, i1 } @cmpxchg_flat_agent_i32__noprivate( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[VAL:%.*]], i32 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i32 [[VAL]], i32 [[SWAP]] syncscope("agent") monotonic seq_cst, align 4, !noalias.addrspace [[META0:![0-9]+]] +; CHECK-NEXT: ret { i32, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i32 %val, i32 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !0 + ret { i32, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64__noprivate(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64__noprivate( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: 
[[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META0]] +; CHECK-NEXT: ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !0 + ret { i64, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64__nolocal(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64__nolocal( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]] +; CHECK-NEXT: ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !1 + ret { i64, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64_mmra(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_mmra( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META2:![0-9]+]] +; CHECK-NEXT: ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !mmra !4 + ret { i64, i1 } %result +} + +define { i64, i1 } @cmpxchg_flat_agent_i64_mmra_noprivate(ptr %ptr, i64 %val, i64 %swap) { +; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_mmra_noprivate( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META2]], !noalias.addrspace [[META1]] +; CHECK-NEXT: ret { i64, i1 } [[RESULT]] +; + %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace 
!1, !mmra !4 + ret { i64, i1 } %result +} + +!0 = !{i32 5, i32 6} +!1 = !{i32 3, i32 4} +!2 = !{!"foo", !"bar"} +!3 = !{!"bux", !"baz"} +!4 = !{!2, !3} +!5 = !{} + + +;. +; CHECK: [[META0]] = !{i32 5, i32 6} +; CHECK: [[META1]] = !{i32 3, i32 4} +; CHECK: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} +; CHECK: [[META3]] = !{!"foo", !"bar"} +; CHECK: [[META4]] = !{!"bux", !"baz"} +;. diff --git a/llvm/test/Transforms/Attributor/nofpclass-phiselect.ll b/llvm/test/Transforms/Attributor/nofpclass-phiselect.ll index 6635280bc4360..7c753bdd76976 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-phiselect.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-phiselect.ll @@ -2,7 +2,7 @@ ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -S < %s | FileCheck %s define float @phi_select(i1 %c, float nofpclass(inf) %base, float nofpclass(inf) %arg) { -; CHECK-LABEL: define float @phi_select +; CHECK-LABEL: define nofpclass(inf) float @phi_select ; CHECK-SAME: (i1 [[C:%.*]], float nofpclass(inf) [[BASE:%.*]], float nofpclass(inf) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -25,6 +25,30 @@ exit: ret float %select } +define float @phi_select_c(i1 %c, float nofpclass(inf) %base, float nofpclass(inf) %arg) { +; CHECK-LABEL: define nofpclass(inf) float @phi_select_c +; CHECK-SAME: (i1 [[C:%.*]], float nofpclass(inf) [[BASE:%.*]], float nofpclass(inf) [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi float [ [[BASE]], [[ENTRY:%.*]] ], [ [[SELECT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SELECT]] = select i1 [[C]], float [[ARG]], float [[PHI]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SELECT]] +; +entry: + br label %loop + +loop: + %phi = phi float [ %base, %entry ], [ %select, %loop ] + %select = select i1 %c, float %arg, float %phi + br i1 %c, label %loop, 
label %exit + +exit: + ret float %select +} + define float @phi_select_onlybase(i1 %c, float nofpclass(inf) %base, float %arg) { ; CHECK-LABEL: define float @phi_select_onlybase ; CHECK-SAME: (i1 [[C:%.*]], float nofpclass(inf) [[BASE:%.*]], float [[ARG:%.*]]) #[[ATTR0]] { @@ -74,7 +98,7 @@ exit: } define float @phi_phi(i1 %c, float nofpclass(inf) %base, float nofpclass(inf) %arg) { -; CHECK-LABEL: define float @phi_phi +; CHECK-LABEL: define nofpclass(inf) float @phi_phi ; CHECK-SAME: (i1 [[C:%.*]], float nofpclass(inf) [[BASE:%.*]], float nofpclass(inf) [[ARG:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll index abfbf2e5e582e..33d18d0e2a795 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll @@ -319,20 +319,20 @@ for.body.tail: define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind { ; CHECK-LABEL: define void @simple_urem_to_sel_vec( ; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[ENTRY:.*]]: +; CHECK: [[FOR_COND_CLEANUP:.*]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ] ; CHECK-NEXT: tail call void @use.2xi64(<2 x i64> [[REM]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> 
[[TMP1]], [[REM_AMT]] ; CHECK-NEXT: [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]] ; CHECK-NEXT: [[INC]] = add nuw <2 x i64> [[I_04]], ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = call i1 @get.i1() -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; entry: br label %for.body @@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5 -; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]] ; CHECK-NEXT: tail call void @use.i32(i32 [[REM]]) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]] +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]] ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] diff --git a/llvm/test/Transforms/EarlyCSE/nofpclass-phi-regression.ll b/llvm/test/Transforms/EarlyCSE/nofpclass-phi-regression.ll index 81cead29ac910..6fc6b73504982 100644 --- a/llvm/test/Transforms/EarlyCSE/nofpclass-phi-regression.ll +++ b/llvm/test/Transforms/EarlyCSE/nofpclass-phi-regression.ll @@ -20,7 +20,6 @@ define void @compute_known_fpclass_phi_assert(i1 %cmp46, i1 %tobool51, ptr %a01) ; CHECK-NEXT: [[TMP1]] = phi double [ 0.000000e+00, [[IF_THEN52]] ], [ [[TMP0]], [[FOR_BODY48]] ] ; CHECK-NEXT: br label [[FOR_COND45]] ; CHECK: for.end82: -; CHECK-NEXT: [[MUL84:%.*]] = fmul double [[TMP0]], 0.000000e+00 ; CHECK-NEXT: ret 
void ; entry: diff --git a/llvm/test/Transforms/GVN/pr113997.ll b/llvm/test/Transforms/GVN/pr113997.ll new file mode 100644 index 0000000000000..35e73b1a4439b --- /dev/null +++ b/llvm/test/Transforms/GVN/pr113997.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=gvn < %s | FileCheck %s + +; Make sure attributes in function calls are intersected correctly. + +define i1 @bucket(i32 noundef %x) { +; CHECK-LABEL: define i1 @bucket( +; CHECK-SAME: i32 noundef [[X:%.*]]) { +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[CTPOP1:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ult i32 [[CTPOP1]], 2 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CTPOP1]], 1 +; CHECK-NEXT: ret i1 [[RES]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i1 false +; + %cmp1 = icmp sgt i32 %x, 0 + %ctpop1 = tail call range(i32 1, 32) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp samesign ult i32 %ctpop1, 2 + %cond = select i1 %cmp1, i1 %cmp2, i1 false + br i1 %cond, label %if.then, label %if.else + +if.else: + %ctpop2 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %x) + %res = icmp eq i32 %ctpop2, 1 + ret i1 %res + +if.then: + ret i1 false +} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index a0856ac9127e6..e2dada85ef872 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -24,7 +24,7 @@ entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2 - %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, 
!amdgpu.no.fine.grained.memory !0 + %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -59,7 +59,7 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -107,9 +107,9 @@ bb1: ; preds = %entry %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64 %i.8 = add nsw i64 %i.7, 1 %i.9 = inttoptr i64 %i.8 to ptr addrspace(1) - %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr - %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -175,3 +175,4 @@ attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index d8266f4c6703d..452d90aa98d88 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -589,6 +589,15 @@ declare ptr @gets(ptr) ; CHECK: declare noundef i32 @gettimeofday(ptr nocapture noundef, ptr nocapture noundef) [[NOFREE_NOUNWIND]] declare 
i32 @gettimeofday(ptr, ptr) +; CHECK: declare double @hypot(double, double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare double @hypot(double, double) + +; CHECK: declare float @hypotf(float, float) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare float @hypotf(float, float) + +; CHECK: declare x86_fp80 @hypotl(x86_fp80, x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare x86_fp80 @hypotl(x86_fp80, x86_fp80) + ; CHECK: declare i32 @isascii(i32) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] declare i32 @isascii(i32) diff --git a/llvm/test/Transforms/Inline/convergence-inline.ll b/llvm/test/Transforms/Inline/convergence-inline.ll index 8c67e6a59b7db..4996a2376be63 100644 --- a/llvm/test/Transforms/Inline/convergence-inline.ll +++ b/llvm/test/Transforms/Inline/convergence-inline.ll @@ -185,6 +185,30 @@ define void @test_two_calls() convergent { ret void } +define i32 @token_not_first(i32 %x) convergent alwaysinline { +; CHECK-LABEL: @token_not_first( +; CHECK-NEXT: {{%.*}} = alloca ptr, align 8 +; CHECK-NEXT: [[TOKEN:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: [[Y:%.*]] = call i32 @g(i32 [[X:%.*]]) [ "convergencectrl"(token [[TOKEN]]) ] +; CHECK-NEXT: ret i32 [[Y]] +; + %p = alloca ptr, align 8 + %token = call token @llvm.experimental.convergence.entry() + %y = call i32 @g(i32 %x) [ "convergencectrl"(token %token) ] + ret i32 %y +} + +define void @test_token_not_first() convergent { +; CHECK-LABEL: @test_token_not_first( +; CHECK-NEXT: [[TOKEN:%.*]] = call token @llvm.experimental.convergence.entry() +; CHECK-NEXT: {{%.*}} = call i32 @g(i32 23) [ "convergencectrl"(token [[TOKEN]]) ] +; CHECK-NEXT: ret void +; + %token = call token @llvm.experimental.convergence.entry() + %x = call i32 @token_not_first(i32 23) [ "convergencectrl"(token %token) ] + ret void +} + declare void @f(i32) convergent declare i32 @g(i32) convergent diff --git a/llvm/test/Transforms/InstCombine/NVPTX/isspacep.ll 
b/llvm/test/Transforms/InstCombine/NVPTX/isspacep.ll new file mode 100644 index 0000000000000..dedd85e1a8cda --- /dev/null +++ b/llvm/test/Transforms/InstCombine/NVPTX/isspacep.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Source data in different AS. +@shared_data = dso_local addrspace(3) global i32 undef, align 4 +@global_data = dso_local addrspace(1) externally_initialized global i32 0, align 4 +@const_data = dso_local addrspace(4) externally_initialized constant i32 3, align 4 + +; Results get stored here. +@gen = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@g1 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@g2 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@s1 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@s2 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@c1 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@c2 = dso_local addrspace(1) externally_initialized global i8 0, align 1 +@l = dso_local addrspace(1) externally_initialized global i8 0, align 1 + +declare i1 @llvm.nvvm.isspacep.global(ptr nocapture) +declare i1 @llvm.nvvm.isspacep.shared(ptr nocapture) +declare i1 @llvm.nvvm.isspacep.const(ptr nocapture) +declare i1 @llvm.nvvm.isspacep.local(ptr nocapture) + +define dso_local void @check_global(ptr nocapture noundef readnone %out, ptr nocapture noundef readnone %genp, +; CHECK-LABEL: define dso_local void @check_global( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr nocapture noundef readnone [[GENP:%.*]], ptr addrspace(1) [[GP:%.*]], ptr addrspace(3) [[SP:%.*]], ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[LP:%.*]]) local_unnamed_addr { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEN0:%.*]] = tail call i1 @llvm.nvvm.isspacep.global(ptr [[GENP]]) +; CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[GEN0]] to i8 +; CHECK-NEXT: store i8 [[STOREDV]], ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %gp, + ptr addrspace(3) %sp, + ptr addrspace(4) %cp, + ptr addrspace(5) %lp) local_unnamed_addr { +entry: + ; No constant folding for generic pointers of unknown origin. 
+ %gen0 = tail call i1 @llvm.nvvm.isspacep.global(ptr %genp) + %storedv = zext i1 %gen0 to i8 + store i8 %storedv, ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 + + %isg1 = tail call i1 @llvm.nvvm.isspacep.global(ptr addrspacecast (ptr addrspace(1) @global_data to ptr)) + %isg18 = zext i1 %isg1 to i8 + store i8 %isg18, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 + + %gp_asc = addrspacecast ptr addrspace(1) %gp to ptr + %isg2 = tail call i1 @llvm.nvvm.isspacep.global(ptr %gp_asc) + %isg28 = zext i1 %isg2 to i8 + store i8 %isg28, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 + + %iss1 = tail call i1 @llvm.nvvm.isspacep.global(ptr addrspacecast (ptr addrspace(3) @shared_data to ptr)) + %iss18 = zext i1 %iss1 to i8 + store i8 %iss18, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 + + %sp_asc = addrspacecast ptr addrspace(3) %sp to ptr + %iss2 = tail call i1 @llvm.nvvm.isspacep.global(ptr %sp_asc) + %iss28 = zext i1 %iss2 to i8 + store i8 %iss28, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 + + %isc1 = tail call i1 @llvm.nvvm.isspacep.global(ptr addrspacecast (ptr addrspace(4) @const_data to ptr)) + %isc18 = zext i1 %isc1 to i8 + store i8 %isc18, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 + + %cp_asc = addrspacecast ptr addrspace(4) %cp to ptr + %isc2 = tail call i1 @llvm.nvvm.isspacep.global(ptr %cp_asc) + %isc28 = zext i1 %isc2 to i8 + store i8 %isc28, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 + + ; Local data can't have a constant address, so we can't have a constant ASC expression + ; We can only use an ASC instruction. 
+ %lp_asc = addrspacecast ptr addrspace(5) %lp to ptr + %isl = call i1 @llvm.nvvm.isspacep.global(ptr nonnull %lp_asc) + %isl8 = zext i1 %isl to i8 + store i8 %isl8, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 + + ret void +} + +define dso_local void @check_shared(ptr nocapture noundef readnone %out, ptr nocapture noundef readnone %genp, +; CHECK-LABEL: define dso_local void @check_shared( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr nocapture noundef readnone [[GENP:%.*]], ptr addrspace(1) [[GP:%.*]], ptr addrspace(3) [[SP:%.*]], ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[LP:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEN0:%.*]] = tail call i1 @llvm.nvvm.isspacep.shared(ptr [[GENP]]) +; CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[GEN0]] to i8 +; CHECK-NEXT: store i8 [[STOREDV]], ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %gp, + ptr addrspace(3) %sp, + ptr addrspace(4) %cp, + ptr addrspace(5) %lp) local_unnamed_addr { +entry: + ; No constant folding for generic pointers of unknown origin. 
+ %gen0 = tail call i1 @llvm.nvvm.isspacep.shared(ptr %genp) + %storedv = zext i1 %gen0 to i8 + store i8 %storedv, ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 + + %isg1 = tail call i1 @llvm.nvvm.isspacep.shared(ptr addrspacecast (ptr addrspace(1) @global_data to ptr)) + %isg18 = zext i1 %isg1 to i8 + store i8 %isg18, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 + + %gp_asc = addrspacecast ptr addrspace(1) %gp to ptr + %isg2 = tail call i1 @llvm.nvvm.isspacep.shared(ptr %gp_asc) + %isg28 = zext i1 %isg2 to i8 + store i8 %isg28, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 + + %iss1 = tail call i1 @llvm.nvvm.isspacep.shared(ptr addrspacecast (ptr addrspace(3) @shared_data to ptr)) + %iss18 = zext i1 %iss1 to i8 + store i8 %iss18, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 + + %sp_asc = addrspacecast ptr addrspace(3) %sp to ptr + %iss2 = tail call i1 @llvm.nvvm.isspacep.shared(ptr %sp_asc) + %iss28 = zext i1 %iss2 to i8 + store i8 %iss28, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 + + %isc1 = tail call i1 @llvm.nvvm.isspacep.shared(ptr addrspacecast (ptr addrspace(4) @const_data to ptr)) + %isc18 = zext i1 %isc1 to i8 + store i8 %isc18, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 + + %cp_asc = addrspacecast ptr addrspace(4) %cp to ptr + %isc2 = tail call i1 @llvm.nvvm.isspacep.shared(ptr %cp_asc) + %isc28 = zext i1 %isc2 to i8 + store i8 %isc28, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 + + ; Local data can't have a constant address, so we can't have a constant ASC expression + ; We can only use an ASC instruction. 
+ %lp_asc = addrspacecast ptr addrspace(5) %lp to ptr + %isl = call i1 @llvm.nvvm.isspacep.shared(ptr nonnull %lp_asc) + %isl8 = zext i1 %isl to i8 + store i8 %isl8, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 + + ret void +} + +define dso_local void @check_const(ptr nocapture noundef readnone %out, ptr nocapture noundef readnone %genp, +; CHECK-LABEL: define dso_local void @check_const( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr nocapture noundef readnone [[GENP:%.*]], ptr addrspace(1) [[GP:%.*]], ptr addrspace(3) [[SP:%.*]], ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[LP:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEN0:%.*]] = tail call i1 @llvm.nvvm.isspacep.const(ptr [[GENP]]) +; CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[GEN0]] to i8 +; CHECK-NEXT: store i8 [[STOREDV]], ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %gp, + ptr addrspace(3) %sp, + ptr addrspace(4) %cp, + ptr addrspace(5) %lp) local_unnamed_addr { +entry: + ; No constant folding for generic pointers of unknown origin. 
+ %gen0 = tail call i1 @llvm.nvvm.isspacep.const(ptr %genp) + %storedv = zext i1 %gen0 to i8 + store i8 %storedv, ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 + + %isg1 = tail call i1 @llvm.nvvm.isspacep.const(ptr addrspacecast (ptr addrspace(1) @global_data to ptr)) + %isg18 = zext i1 %isg1 to i8 + store i8 %isg18, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 + + %gp_asc = addrspacecast ptr addrspace(1) %gp to ptr + %isg2 = tail call i1 @llvm.nvvm.isspacep.const(ptr %gp_asc) + %isg28 = zext i1 %isg2 to i8 + store i8 %isg28, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 + + %iss1 = tail call i1 @llvm.nvvm.isspacep.const(ptr addrspacecast (ptr addrspace(3) @shared_data to ptr)) + %iss18 = zext i1 %iss1 to i8 + store i8 %iss18, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 + + %sp_asc = addrspacecast ptr addrspace(3) %sp to ptr + %iss2 = tail call i1 @llvm.nvvm.isspacep.const(ptr %sp_asc) + %iss28 = zext i1 %iss2 to i8 + store i8 %iss28, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 + + %isc1 = tail call i1 @llvm.nvvm.isspacep.const(ptr addrspacecast (ptr addrspace(4) @const_data to ptr)) + %isc18 = zext i1 %isc1 to i8 + store i8 %isc18, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 + + %cp_asc = addrspacecast ptr addrspace(4) %cp to ptr + %isc2 = tail call i1 @llvm.nvvm.isspacep.const(ptr %cp_asc) + %isc28 = zext i1 %isc2 to i8 + store i8 %isc28, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 + + ; Local data can't have a constant address, so we can't have a constant ASC expression + ; We can only use an ASC instruction. 
+ %lp_asc = addrspacecast ptr addrspace(5) %lp to ptr + %isl = call i1 @llvm.nvvm.isspacep.const(ptr nonnull %lp_asc) + %isl8 = zext i1 %isl to i8 + store i8 %isl8, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 + + ret void +} + +define dso_local void @check_local(ptr nocapture noundef readnone %out, ptr nocapture noundef readnone %genp, +; CHECK-LABEL: define dso_local void @check_local( +; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr nocapture noundef readnone [[GENP:%.*]], ptr addrspace(1) [[GP:%.*]], ptr addrspace(3) [[SP:%.*]], ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[LP:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEN0:%.*]] = tail call i1 @llvm.nvvm.isspacep.local(ptr [[GENP]]) +; CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[GEN0]] to i8 +; CHECK-NEXT: store i8 [[STOREDV]], ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 +; CHECK-NEXT: store i8 0, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 +; CHECK-NEXT: store i8 1, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 +; CHECK-NEXT: ret void +; + ptr addrspace(1) %gp, + ptr addrspace(3) %sp, + ptr addrspace(4) %cp, + ptr addrspace(5) %lp) local_unnamed_addr { +entry: + ; No constant folding for generic pointers of unknown origin. 
+ %gen0 = tail call i1 @llvm.nvvm.isspacep.local(ptr %genp) + %storedv = zext i1 %gen0 to i8 + store i8 %storedv, ptr addrspacecast (ptr addrspace(1) @gen to ptr), align 1 + + %isg1 = tail call i1 @llvm.nvvm.isspacep.local(ptr addrspacecast (ptr addrspace(1) @global_data to ptr)) + %isg18 = zext i1 %isg1 to i8 + store i8 %isg18, ptr addrspacecast (ptr addrspace(1) @g1 to ptr), align 1 + + %gp_asc = addrspacecast ptr addrspace(1) %gp to ptr + %isg2 = tail call i1 @llvm.nvvm.isspacep.local(ptr %gp_asc) + %isg28 = zext i1 %isg2 to i8 + store i8 %isg28, ptr addrspacecast (ptr addrspace(1) @g2 to ptr), align 1 + + %iss1 = tail call i1 @llvm.nvvm.isspacep.local(ptr addrspacecast (ptr addrspace(3) @shared_data to ptr)) + %iss18 = zext i1 %iss1 to i8 + store i8 %iss18, ptr addrspacecast (ptr addrspace(1) @s1 to ptr), align 1 + + %sp_asc = addrspacecast ptr addrspace(3) %sp to ptr + %iss2 = tail call i1 @llvm.nvvm.isspacep.local(ptr %sp_asc) + %iss28 = zext i1 %iss2 to i8 + store i8 %iss28, ptr addrspacecast (ptr addrspace(1) @s2 to ptr), align 1 + + %isc1 = tail call i1 @llvm.nvvm.isspacep.local(ptr addrspacecast (ptr addrspace(4) @const_data to ptr)) + %isc18 = zext i1 %isc1 to i8 + store i8 %isc18, ptr addrspacecast (ptr addrspace(1) @c1 to ptr), align 1 + + %cp_asc = addrspacecast ptr addrspace(4) %cp to ptr + %isc2 = tail call i1 @llvm.nvvm.isspacep.local(ptr %cp_asc) + %isc28 = zext i1 %isc2 to i8 + store i8 %isc28, ptr addrspacecast (ptr addrspace(1) @c2 to ptr), align 1 + + ; Local data can't have a constant address, so we can't have a constant ASC expression + ; We can only use an ASC instruction. 
+ %lp_asc = addrspacecast ptr addrspace(5) %lp to ptr + %isl = call i1 @llvm.nvvm.isspacep.local(ptr nonnull %lp_asc) + %isl8 = zext i1 %isl to i8 + store i8 %isl8, ptr addrspacecast (ptr addrspace(1) @l to ptr), align 1 + + ret void +} + diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll index a91c9bfc91c40..d5d7ce9b7b263 100644 --- a/llvm/test/Transforms/InstCombine/div.ll +++ b/llvm/test/Transforms/InstCombine/div.ll @@ -1163,7 +1163,8 @@ define <2 x i8> @sdiv_constant_dividend_select_of_constants_divisor_vec(i1 %b) { define <2 x i8> @sdiv_constant_dividend_select_of_constants_divisor_vec_ub1(i1 %b) { ; CHECK-LABEL: @sdiv_constant_dividend_select_of_constants_divisor_vec_ub1( -; CHECK-NEXT: ret <2 x i8> +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], <2 x i8> , <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = select i1 %b, <2 x i8> , <2 x i8> %r = sdiv <2 x i8> , %s @@ -1269,7 +1270,8 @@ define <2 x i8> @udiv_constant_dividend_select_of_constants_divisor_vec(i1 %b) { define <2 x i8> @udiv_constant_dividend_select_of_constants_divisor_vec_ub1(i1 %b) { ; CHECK-LABEL: @udiv_constant_dividend_select_of_constants_divisor_vec_ub1( -; CHECK-NEXT: ret <2 x i8> +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], <2 x i8> , <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = select i1 %b, <2 x i8> , <2 x i8> %r = udiv <2 x i8> , %s diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll index 1c28b151825c1..caf38c676e20d 100644 --- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll +++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll @@ -468,7 +468,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float 
[[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -482,7 +482,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -496,7 +496,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -510,7 +510,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) { ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq 
float %x, 0.0 @@ -559,7 +559,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float % ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -572,7 +572,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -586,7 +586,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_ne ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -599,7 +599,7 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_ne ; CHECK-LABEL: 
@fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 @@ -706,7 +706,7 @@ define float @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x) { ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz( ; CHECK-NEXT: [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 ; CHECK-NEXT: [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00 -; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[X]], [[SCALED_X]] +; CHECK-NEXT: [[SCALED_IF_DENORMAL:%.*]] = fmul nnan float [[X]], [[SCALED_X]] ; CHECK-NEXT: ret float [[SCALED_IF_DENORMAL]] ; %x.is.zero = fcmp oeq float %x, 0.0 diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll index 4262ef85553b6..4f7687aeaf8bc 100644 --- a/llvm/test/Transforms/InstCombine/rem.ll +++ b/llvm/test/Transforms/InstCombine/rem.ll @@ -997,7 +997,8 @@ define <2 x i8> @urem_constant_dividend_select_of_constants_divisor_vec(i1 %b) { define <2 x i8> @urem_constant_dividend_select_of_constants_divisor_vec_ub1(i1 %b) { ; CHECK-LABEL: @urem_constant_dividend_select_of_constants_divisor_vec_ub1( -; CHECK-NEXT: ret <2 x i8> +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], <2 x i8> , <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = select i1 %b, <2 x i8> , <2 x i8> %r = urem <2 x i8> , %s diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index a88fd3cc21f1b..af8a9314a0804 100644 --- 
a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -1890,6 +1890,21 @@ define <2 x i32> @uadd_sat_not_ugt_commute_add(<2 x i32> %x, <2 x i32> %yp) { ret <2 x i32> %r } +define <2 x i32> @uadd_sat_not_ugt_commute_add_partial_poison(<2 x i32> %x, <2 x i32> %yp) { +; CHECK-LABEL: @uadd_sat_not_ugt_commute_add_partial_poison( +; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[A:%.*]] = add nuw <2 x i32> [[YP:%.*]], [[NOTX]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> [[YP]], [[X]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> , <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %notx = xor <2 x i32> %x, + %a = add nuw <2 x i32> %yp, %notx + %c = icmp ugt <2 x i32> %yp, %x + %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %a + ret <2 x i32> %r +} + define i32 @uadd_sat_not_commute_select(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_not_commute_select( ; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll index e5ad312bb85c1..253bc9e784c2f 100644 --- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll +++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll @@ -48,7 +48,7 @@ define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fadd fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fadd reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd fast float %A, %B @@ -59,7 +59,7 @@ define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) { define float 
@select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fadd_swapped_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fadd fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fadd reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fadd fast float %A, %B @@ -125,7 +125,7 @@ define float @select_nnan_fmul_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fmul_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fmul fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fmul reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fmul fast float %A, %B @@ -136,7 +136,7 @@ define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fmul_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fmul_swapped_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fmul fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fmul reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fmul fast float %A, %B @@ -169,7 +169,7 @@ define float @select_nnan_fsub_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fsub_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fsub fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fsub reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fsub fast float %A, %B @@ -180,7 +180,7 @@ define float 
@select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fsub_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fsub_swapped_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fsub fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fsub reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fsub fast float %A, %B @@ -247,7 +247,7 @@ define float @select_nnan_fdiv_swapped(i1 %cond, float %A, float %B) { define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fdiv_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00 -; CHECK-NEXT: [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fdiv reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fdiv fast float %A, %B @@ -258,7 +258,7 @@ define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) { define float @select_nnan_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) { ; CHECK-LABEL: @select_nnan_fdiv_swapped_fast_math( ; CHECK-NEXT: [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]] -; CHECK-NEXT: [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]] +; CHECK-NEXT: [[D:%.*]] = fdiv reassoc nnan arcp contract afn float [[A:%.*]], [[C]] ; CHECK-NEXT: ret float [[D]] ; %C = fdiv fast float %A, %B diff --git a/llvm/test/Transforms/InstCombine/vector-udiv.ll b/llvm/test/Transforms/InstCombine/vector-udiv.ll index c817b3a1ac5a0..0289b7c70cc4f 100644 --- a/llvm/test/Transforms/InstCombine/vector-udiv.ll +++ b/llvm/test/Transforms/InstCombine/vector-udiv.ll @@ -97,3 +97,16 @@ define <4 x i32> @test_v4i32_zext_shl_const_pow2(<4 x i32> %a0, <4 x i16> %a1) { %3 = udiv <4 x i32> %a0, %2 ret <4 x i32> %3 } + +; Make sure we do not simplify udiv , to +; poison when threading udiv 
over selects + +define <2 x i32> @vec_select_udiv_poison(<2 x i1> %x) { +; CHECK-LABEL: @vec_select_udiv_poison( +; CHECK-NEXT: [[DIV:%.*]] = select <2 x i1> [[X:%.*]], <2 x i32> zeroinitializer, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[DIV]] +; + %sel = select <2 x i1> %x, <2 x i32> , <2 x i32> + %div = udiv <2 x i32> , %sel + ret <2 x i32> %div +} diff --git a/llvm/test/Transforms/InstSimplify/div.ll b/llvm/test/Transforms/InstSimplify/div.ll index 5ca2e8837b924..e2bc121aee457 100644 --- a/llvm/test/Transforms/InstSimplify/div.ll +++ b/llvm/test/Transforms/InstSimplify/div.ll @@ -29,7 +29,7 @@ define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) { ; CHECK-LABEL: @sdiv_zero_elt_vec_constfold( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: ret <2 x i8> ; %div = sdiv <2 x i8> , ret <2 x i8> %div @@ -37,7 +37,7 @@ define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) { define <2 x i8> @udiv_zero_elt_vec_constfold(<2 x i8> %x) { ; CHECK-LABEL: @udiv_zero_elt_vec_constfold( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: ret <2 x i8> ; %div = udiv <2 x i8> , ret <2 x i8> %div @@ -45,7 +45,8 @@ define <2 x i8> @udiv_zero_elt_vec_constfold(<2 x i8> %x) { define <2 x i8> @sdiv_zero_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @sdiv_zero_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[DIV:%.*]] = sdiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[DIV]] ; %div = sdiv <2 x i8> %x, ret <2 x i8> %div @@ -53,7 +54,8 @@ define <2 x i8> @sdiv_zero_elt_vec(<2 x i8> %x) { define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @udiv_zero_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[DIV:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[DIV]] ; %div = udiv <2 x i8> %x, ret <2 x i8> %div @@ -61,7 +63,8 @@ define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) { define <2 x i8> @sdiv_poison_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @sdiv_poison_elt_vec( -; CHECK-NEXT: 
ret <2 x i8> poison +; CHECK-NEXT: [[DIV:%.*]] = sdiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[DIV]] ; %div = sdiv <2 x i8> %x, ret <2 x i8> %div @@ -69,7 +72,8 @@ define <2 x i8> @sdiv_poison_elt_vec(<2 x i8> %x) { define <2 x i8> @udiv_poison_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @udiv_poison_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[DIV:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[DIV]] ; %div = udiv <2 x i8> %x, ret <2 x i8> %div diff --git a/llvm/test/Transforms/InstSimplify/rem.ll b/llvm/test/Transforms/InstSimplify/rem.ll index aceb7cb12185d..5ec803c6d0481 100644 --- a/llvm/test/Transforms/InstSimplify/rem.ll +++ b/llvm/test/Transforms/InstSimplify/rem.ll @@ -29,7 +29,7 @@ define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) { ; CHECK-LABEL: @srem_zero_elt_vec_constfold( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: ret <2 x i8> ; %rem = srem <2 x i8> , ret <2 x i8> %rem @@ -37,7 +37,7 @@ define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) { define <2 x i8> @urem_zero_elt_vec_constfold(<2 x i8> %x) { ; CHECK-LABEL: @urem_zero_elt_vec_constfold( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: ret <2 x i8> ; %rem = urem <2 x i8> , ret <2 x i8> %rem @@ -45,7 +45,8 @@ define <2 x i8> @urem_zero_elt_vec_constfold(<2 x i8> %x) { define <2 x i8> @srem_zero_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @srem_zero_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[REM:%.*]] = srem <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[REM]] ; %rem = srem <2 x i8> %x, ret <2 x i8> %rem @@ -53,7 +54,8 @@ define <2 x i8> @srem_zero_elt_vec(<2 x i8> %x) { define <2 x i8> @urem_zero_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @urem_zero_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[REM:%.*]] = urem <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[REM]] ; %rem = urem <2 x i8> %x, ret <2 x i8> %rem @@ -61,7 +63,8 @@ define <2 x i8> 
@urem_zero_elt_vec(<2 x i8> %x) { define <2 x i8> @srem_undef_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @srem_undef_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[REM:%.*]] = srem <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[REM]] ; %rem = srem <2 x i8> %x, ret <2 x i8> %rem @@ -69,7 +72,8 @@ define <2 x i8> @srem_undef_elt_vec(<2 x i8> %x) { define <2 x i8> @urem_undef_elt_vec(<2 x i8> %x) { ; CHECK-LABEL: @urem_undef_elt_vec( -; CHECK-NEXT: ret <2 x i8> poison +; CHECK-NEXT: [[REM:%.*]] = urem <2 x i8> [[X:%.*]], +; CHECK-NEXT: ret <2 x i8> [[REM]] ; %rem = urem <2 x i8> %x, ret <2 x i8> %rem diff --git a/llvm/test/Transforms/LoopIdiom/LoongArch/popcnt.ll b/llvm/test/Transforms/LoopIdiom/LoongArch/popcnt.ll index 915a100a54f48..0994a7d9391d3 100644 --- a/llvm/test/Transforms/LoopIdiom/LoongArch/popcnt.ll +++ b/llvm/test/Transforms/LoopIdiom/LoongArch/popcnt.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-idiom -mtriple=loongarch32 -mattr=+lsx -S < %s | FileCheck %s --check-prefix=CPOP -; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -mattr=+lsx -S < %s | FileCheck %s --check-prefix=CPOP ; RUN: opt -passes=loop-idiom -mtriple=loongarch32 -S < %s | FileCheck %s --check-prefix=NOCPOP -; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -S < %s | FileCheck %s --check-prefix=NOCPOP +; RUN: opt -passes=loop-idiom -mtriple=loongarch32 -mattr=+lsx -S < %s | FileCheck %s --check-prefix=CPOP +; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -S < %s | FileCheck %s --check-prefix=CPOP +; RUN: opt -passes=loop-idiom -mtriple=loongarch64 -mattr=-lsx -S < %s | FileCheck %s --check-prefix=NOCPOP ; Mostly copied from RISCV version. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 36eee8d0c98ce..cd7662a657dfe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -117,8 +117,8 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -175,8 +175,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 61bd8c51e1605..1a4ed0f21bf4c 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -868,8 +868,8 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll index 9be068ce880ea..6257d3325f979 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll @@ -45,6 +45,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %N ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -82,6 +87,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %N ; CHECK-NEXT: No successors ; CHECK-NEXT: } diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index 74fd76df99259..994f2f5e37763 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -43,6 +43,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -80,6 +85,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -122,6 +132,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -159,6 +174,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -200,6 +220,11 @@ target triple = "aarch64-unknown-linux-gnu" ; 
CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -237,6 +262,11 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: IR %exitcond = icmp eq i64 %indvars.iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index e9303ec9d3eb7..afc2fd5a049ad 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -41,6 +41,11 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %cmp = icmp ne i64 %iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -78,6 +83,11 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %cmp = icmp ne i64 %iv.next, 1024 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll index 
fc56754166d60..b0cccf4d0a7bf 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -45,8 +45,8 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] ; CHECK-NEXT: [[TMP8:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast <4 x float> [[TMP8]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[TMP9]], [[VEC_PHI]] -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[DOTNOT9]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP10]] +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[DOTNOT9]], <4 x float> , <4 x float> [[TMP9]] +; CHECK-NEXT: [[PREDPHI]] = fadd reassoc arcp contract afn <4 x float> [[VEC_PHI]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll index be9b170491b9c..5cfdd635e6883 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll @@ -3,7 +3,7 @@ ; RUN: -S < %s 2>&1 | FileCheck %s ; CHECK-LABEL: foo -; CHECK: %{{.*}} = add {{.*}}, 2 +; CHECK: %{{.*}} = add {{.*}}, 8 ; Function Attrs: nofree norecurse nosync nounwind writeonly define dso_local void @foo(i32 signext %n, ptr nocapture %A) local_unnamed_addr #0 { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 0e55ad65cdb2c..6724afd6ca10f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -410,45 +410,49 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) -; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 16, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = mul [[TMP10]], shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; 
CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP8]], align 1 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[STEP_ADD]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP11]], <8 x ptr> [[TMP13]], i32 4, <8 x i1> ) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP12]], <8 x ptr> [[TMP14]], i32 4, <8 x i1> 
) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i8( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = zext [[TMP16]] to +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP17]], [[TMP18]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] @@ -462,9 +466,9 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: 
[[EXT:%.*]] = zext i8 [[L_1]] to i32 ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 4 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 2 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -481,7 +485,7 @@ loop: %ext = zext i8 %l.1 to i32 %gep.dst = getelementptr i32, ptr %dst, i64 %iv store i32 %ext, ptr %gep.dst, align 4 - %iv.next = add nsw i64 %iv, 4 + %iv.next = add nsw i64 %iv, 2 %ec = icmp slt i64 %iv, %N br i1 %ec, label %loop, label %exit diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index f4dfdacac1b32..dd2e75f1f5e21 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -24,8 +24,8 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3 ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll index fa346b4eac02d..6477f14e3c698 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll @@ -6,26 +6,26 @@ define void @i8_factor_2(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_2' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 5 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 2 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 2 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 2 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 5 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF vscale x 
16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; CHECK: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; CHECK: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -49,16 +49,16 @@ define void @i8_factor_3(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_3' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; 
CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 9 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -86,16 +86,16 @@ define void @i8_factor_4(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_4' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 4 at 
, ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 9 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0 @@ -127,14 +127,14 @@ define void @i8_factor_5(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_5' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 9 for VF 16: 
INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; CHECK: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; CHECK: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; CHECK: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; CHECK: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; CHECK: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; CHECK: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.5, ptr %data, i64 %i, i32 0 @@ -170,14 +170,14 @@ define void @i8_factor_6(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_6' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; CHECK: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; CHECK: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; CHECK: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; CHECK: 
Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; CHECK: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; CHECK: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.6, ptr %data, i64 %i, i32 0 @@ -217,14 +217,14 @@ define void @i8_factor_7(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_7' -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; CHECK: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; CHECK: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; CHECK: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; CHECK: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; CHECK: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; CHECK: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.7, ptr %data, i64 %i, i32 0 @@ -268,14 +268,14 @@ define void @i8_factor_8(ptr %data, i64 %n) { entry: br label %for.body ; CHECK-LABEL: Checking a loop in 'i8_factor_8' -; CHECK: Cost of 2 for VF 2: 
INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; CHECK: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; CHECK: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; CHECK: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; CHECK: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; CHECK: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; CHECK: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; CHECK: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; CHECK: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.8, ptr %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index c7bb1ffab23e7..a38835f5613fd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -96,6 +96,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: 
IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] @@ -181,6 +187,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue @@ -303,6 +315,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] @@ -388,6 +406,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] +; CHECK-NEXT: IR %i.0.in8 
= phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] +; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: LV: Loop does not require scalar epilogue diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 59db6c197ef8c..77a9d105c85f3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -66,9 +66,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb ; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> +; IF-EVL-INLOOP-NEXT: ir-bb: +; IF-EVL-INLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; IF-EVL-INLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; IF-EVL-INLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n +; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-NEXT: } ; @@ -108,9 +112,13 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb ; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> +; NO-VP-OUTLOOP-NEXT: ir-bb: +; NO-VP-OUTLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; NO-VP-OUTLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; NO-VP-OUTLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n +; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-NEXT: } ; @@ -150,9 +158,13 @@ 
define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: ; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-NEXT: Successor(s): ir-bb ; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> +; NO-VP-INLOOP-NEXT: ir-bb: +; NO-VP-INLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; NO-VP-INLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] +; NO-VP-INLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n +; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 8dca8302e8714..c0098eb533c00 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -390,8 +390,8 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index fe48008792ff7..bcacfb358ec05 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -45,10 +45,14 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> ; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1) +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond.not = icmp eq i64 %iv.next, 1000 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -118,11 +122,15 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> ; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> ; CHECK-NEXT: EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 -; CHECK-NEXT: Live-out i16 %for.3 = vp<[[RESUME_3_P]]>.2 +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1) +; CHECK-NEXT: IR %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2) +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond.not = icmp eq i64 
%iv.next, 1000 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -193,10 +201,13 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) { ; CHECK-NEXT: scalar.ph: ; CHECK-NEXT: EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]> -; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>.1 +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] +; CHECK-NEXT: IR %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]>) +; CHECK-NEXT: IR %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1) +; CHECK: No successors ; CHECK-NEXT: } ; entry: @@ -264,10 +275,13 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) { ; CHECK-NEXT: scalar.ph: ; CHECK-NEXT: EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[RESUME_Y:%.+]]>.1 = resume-phi vp<[[EXT_Y]]>.1, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]> -; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>.1 +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] +; CHECK-NEXT: IR %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]>) +; CHECK-NEXT: IR %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1) +; CHECK: No successors ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index 1e34e1d0d517d..b0ece3980cdf2 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -18,10 +18,10 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; entry: br label %loop @@ -61,10 +61,10 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; entry: br label %loop @@ -107,12 +107,12 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: 
middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; entry: br label %loop @@ -219,12 +219,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; entry: br label %loop @@ 
-270,12 +270,12 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; entry: br label %loop @@ -321,12 +321,12 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: 
[[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; entry: br label %loop @@ -371,12 +371,12 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr % ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; entry: br label %loop @@ -420,10 +420,10 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 ; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; 
CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; entry: br label %loop @@ -488,8 +488,8 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true entry: @@ -528,8 +528,8 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true ; entry: @@ -568,8 +568,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 ; CHECK-NEXT: br i1 true ; entry: @@ -611,8 +611,8 @@ define ptr 
@test_first_order_recurrences_and_pointer_induction2(ptr %ptr) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP5]], label %middle.block, label %vector.body ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3 ; CHECK-NEXT: br i1 true ; entry: @@ -657,8 +657,8 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: br i1 true, label %End, label %scalar.ph ; CHECK: scalar.ph: ; CHECK-NEXT: %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index b05980bef1b38..8ae538cf63986 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -82,9 +82,13 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %0 = vp<[[RESUME_1_P]]> +; 
CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -165,9 +169,13 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -233,10 +241,14 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> ; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: IR %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ] +; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -340,9 +352,13 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi 
vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %0 = vp<[[RESUME_1_P]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -433,9 +449,13 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %C = icmp sgt i32 %iv.next, %recur.next +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -515,9 +535,13 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %.pn = vp<[[RESUME_1_P]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: IR %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]>) +; CHECK: IR %ec = icmp ugt i64 %iv, 3 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index c4e3e0b8c5a36..a90594085d3cf 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -907,8 +907,8 @@ define i32 @PR27246() { ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: @@ -1000,8 +1000,8 @@ define i32 @PR27246() { ; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: @@ -1358,8 +1358,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 ; UNROLL-NO-IC-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = 
extractelement <4 x i32> [[TMP0]], i32 2 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1427,8 +1427,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 ; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1555,8 +1555,8 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) { ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.cond.cleanup: ; UNROLL-NO-IC-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ] @@ -1615,8 +1615,8 @@ define i32 @PR33613(ptr %b, 
double %j, i32 %d) { ; UNROLL-NO-VF: scalar.ph: ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] @@ -1684,8 +1684,8 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) { ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ] ; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ] ; SINK-AFTER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-AFTER: for.cond.cleanup: ; SINK-AFTER-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] @@ -3437,8 +3437,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; 
UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -3532,8 +3532,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 +; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index b061cf526b780..8bdba25b1b761 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -5563,8 +5563,8 @@ define i64 @trunc_with_first_order_recurrence() { ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] 
], [ 42, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] @@ -5625,8 +5625,8 @@ define i64 @trunc_with_first_order_recurrence() { ; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i64 1 ; IND-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] +; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: exit: ; IND-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ] @@ -5703,8 +5703,8 @@ define i64 @trunc_with_first_order_recurrence() { ; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i64 1 ; UNROLL-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] +; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: exit: ; UNROLL-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ] @@ -5783,8 +5783,8 @@ define i64 @trunc_with_first_order_recurrence() { ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: exit: ; UNROLL-NO-IC-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] @@ -5861,8 +5861,8 @@ define i64 @trunc_with_first_order_recurrence() { ; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i64 3 ; INTERLEAVE-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: exit: ; INTERLEAVE-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index b7f8ddbfa5d7c..a71666d8c3167 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -116,6 +116,12 @@ declare i32 @llvm.smin.i32(i32, i32) 
; DBG-NEXT: No successors ; DBG-EMPTY: ; DBG-NEXT: scalar.ph: +; DBG-NEXT: Successor(s): ir-bb +; DBG-EMPTY: +; DBG-NEXT: ir-bb: +; DBG-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] +; DBG-NEXT: IR %d = phi i1 [ false, %entry ], [ %d.next, %loop.latch ] +; DBG-NEXT: IR %d.next = xor i1 %d, true ; DBG-NEXT: No successors ; DBG-NEXT: } @@ -217,9 +223,13 @@ exit: ; DBG-EMPTY: ; DBG-NEXT: scalar.ph: ; DBG-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; DBG-NEXT: No successors +; DBG-NEXT: Successor(s): ir-bb ; DBG-EMPTY: -; DBG-NEXT: Live-out i32 %for = vp<[[RESUME_P]]> +; DBG-NEXT: ir-bb: +; DBG-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; DBG-NEXT: IR %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]>) +; DBG: IR %ec = icmp slt i32 %iv.next.trunc, %n +; DBG-NEXT: No successors ; DBG-NEXT: } define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) { diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 6fd5d979724fc..fb174870ed95b 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -23,14 +23,14 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) { ; CHECK-VF4UF1: %[[LOAD]] = load , ptr ; CHECK-VF4UF1: %[[SPLICE:.*]] = call @llvm.vector.splice.nxv4i32( %[[VEC_RECUR]], %[[LOAD]], i32 -1) ; CHECK-VF4UF1: middle.block: -; CHECK-VF4UF1: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32() -; CHECK-VF4UF1: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4 -; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL3]], 1 -; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement %[[LOAD]], i32 %[[SUB3]] ; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4 ; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 
%[[MUL2]], 2 ; CHECK-VF4UF1: %[[VEC_RECUR_FOR_PHI:.*]] = extractelement %[[LOAD]], i32 %[[SUB3]] +; CHECK-VF4UF1: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32() +; CHECK-VF4UF1: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4 +; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL3]], 1 +; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement %[[LOAD]], i32 %[[SUB3]] entry: br label %for.preheader @@ -207,14 +207,14 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[ADD2:.*]], %vector.body ] ; CHECK-VF4UF2: %[[ADD1:.*]] = add %{{.*}}, %[[SPLAT1]] ; CHECK-VF4UF2: middle.block -; CHECK-VF4UF2: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32() -; CHECK-VF4UF2: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4 -; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL3]], 1 -; CHECK-VF4UF2: %vector.recur.extract = extractelement %[[ADD2]], i32 %[[SUB2]] ; CHECK-VF4UF2: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF2: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4 ; CHECK-VF4UF2: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2 ; CHECK-VF4UF2: %vector.recur.extract.for.phi = extractelement %[[ADD2]], i32 %[[SUB3]] +; CHECK-VF4UF2: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32() +; CHECK-VF4UF2: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4 +; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL3]], 1 +; CHECK-VF4UF2: %vector.recur.extract = extractelement %[[ADD2]], i32 %[[SUB2]] entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index cab784b61c544..1f815899ed55c 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -35,6 +35,11 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi 
i64 [ 0, %entry ], [ %iv.next.p, %loop.latch ] +; CHECK: IR %iv.next = add i64 %iv, 1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index a1e28999a4002..53f5a5658fb68 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -91,6 +91,11 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index 81c56f7590079..c9612ced3eee0 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -46,6 +46,12 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: IR %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] +; CHECK: IR %cmp = icmp eq i64 %iv.next, 0 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -93,6 +99,12 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: IR %p.src = phi ptr [ %A, %entry ], [ %p.src.next, 
%loop ] +; CHECK: IR %cmp = icmp eq i64 %iv.next, 0 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 8c7a4e57f9d35..50d406d0c0416 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -51,6 +51,13 @@ define void @foo(i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ] +; CHECK-NEXT: IR %gep.1 = getelementptr inbounds [8 x i64], ptr @arr2, i64 0, i64 %outer.iv +; CHECK-NEXT: IR store i64 %outer.iv, ptr %gep.1, align 4 +; CHECK-NEXT: IR %add = add nsw i64 %outer.iv, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 2247295295663..6bb20a301e0ad 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -42,6 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -104,6 +109,11 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x, ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] +; CHECK: IR %exitcond 
= icmp eq i64 %iv.next, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -166,9 +176,12 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -225,9 +238,13 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] +; CHECK-NEXT: IR %red = phi float [ %red.next, %for.body ], [ 0.000000e+00, %entry ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: @@ -306,6 +323,11 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] +; CHECK-NEXT: IR %cmp = icmp ult i64 %i, 5 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -378,6 +400,11 @@ define void @print_interleave_groups(i32 %C, i32 %D) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK: IR %cmp = icmp slt i64 %iv.next, 1024 ; CHECK-NEXT: No 
successors ; CHECK-NEXT: } ; @@ -454,9 +481,13 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK-NEXT: IR %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ] +; CHECK: IR %exitcond.not = icmp eq i64 %iv.next, %n +; CHECK-NEXT: No successors ; CHECK-NEXT:} entry: @@ -538,7 +569,12 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] +; CHECK: IR %cmp1 = icmp slt i32 %lsd, 100 ; CHECK-NEXT: No successors ; CHECK-NEXT:} ; @@ -619,6 +655,11 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] +; CHECK: IR %iv.next = add i64 %iv, %inc ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -679,6 +720,11 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %ec = icmp eq i32 %iv.next, 1000 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -737,6 +783,11 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr % ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; 
CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -797,6 +848,11 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -877,6 +933,11 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] +; CHECK: IR %ifcond = fcmp oeq float %ld.value, 5.0 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -946,6 +1007,11 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -1038,8 +1104,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> ; CHECK-NEXT: EMIT vp<[[FOR_RESULT:%.+]]> = extract-from-end ir<%for.1.next>, ir<2> +; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): 
ir-bb, scalar.ph @@ -1050,9 +1116,13 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]>) +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK: IR %exitcond.not = icmp eq i64 %iv.next, 1000 +; CHECK-NEXT: No successors ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index f846ba0166b2c..cdeffeff84d03 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -53,6 +53,12 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] +; CHECK-NEXT: IR %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] +; CHECK: IR %tmp5 = trunc i32 %tmp4 to i8 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 446b720ad1ba4..88e7aaccfe2f3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -1077,6 +1077,17 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: Successor(s): ir-bb 
+; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: IR %iv.next = add nsw i32 %iv, -1 +; CHECK-NEXT: IR %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv +; CHECK-NEXT: IR %l = load i32, ptr %gep.src, align 16 +; CHECK-NEXT: IR %dead_gep = getelementptr inbounds i32, ptr %dst, i64 1 +; CHECK-NEXT: IR %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv +; CHECK-NEXT: IR store i32 %l, ptr %gep.dst, align 16 +; CHECK-NEXT: IR %ec = icmp eq i32 %iv.next, 0 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -1157,6 +1168,13 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] +; CHECK-NEXT: IR %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 -1 +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv.next, align 1 +; CHECK-NEXT: IR %c.1 = icmp eq i8 %l, 0 ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll new file mode 100644 index 0000000000000..0c5324ee96c93 --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll @@ -0,0 +1,54 @@ +;; Check that if the address of a weak function is only taken through an alias, +;; it is still added to a list of exported functions and @llvm.type.test() is +;; lowered to an actual check against the generated CFI jumptable. + +RUN: rm -rf %t.dir && split-file %s %t.dir && cd %t.dir +RUN: opt test.ll --thinlto-bc --thinlto-split-lto-unit -o test.bc +RUN: llvm-modextract test.bc -n 0 -o test0.bc +RUN: llvm-modextract test.bc -n 1 -o test1.bc + +;; Check that a CFI jumptable is generated. 
+RUN: opt test1.bc -passes=lowertypetests -lowertypetests-read-summary=in.yaml \ +RUN: -lowertypetests-summary-action=export -lowertypetests-write-summary=exported.yaml \ +RUN: -S -o - | FileCheck %s --check-prefix=REGULAR +REGULAR: @__typeid__ZTSFvvE_global_addr = hidden alias i8, ptr @.cfi.jumptable +REGULAR: @f = alias void (), ptr @.cfi.jumptable +REGULAR: define private void @.cfi.jumptable() + +;; CHECK that @llvm.type.test() is lowered to an actual check. +RUN: opt test0.bc -passes=lowertypetests -lowertypetests-read-summary=exported.yaml \ +RUN: -lowertypetests-summary-action=import -S -o - | FileCheck %s --check-prefix=THIN +THIN: define i1 @test() { +THIN-NEXT: %1 = icmp eq i64 ptrtoint (ptr @alias to i64), ptrtoint (ptr @__typeid__ZTSFvvE_global_addr to i64) +THIN-NEXT: ret i1 %1 +THIN-NEXT: } + +;--- test.ll +target triple = "x86_64-pc-linux-gnu" + +@alias = alias void(), ptr @f + +define weak void @f() !type !0 { + ret void +} + +define i1 @test() { + %1 = call i1 @llvm.type.test(ptr nonnull @alias, metadata !"_ZTSFvvE") + ret i1 %1 +} + +declare i1 @llvm.type.test(ptr, metadata) + +!0 = !{i64 0, !"_ZTSFvvE"} +;--- in.yaml +--- +GlobalValueMap: + 8346051122425466633: # guid("test") + - Live: true + Refs: [5833419078793185394] # guid("alias") + TypeTests: [9080559750644022485] # guid("_ZTSFvvE") + 5833419078793185394: # guid("alias") + - Aliasee: 14740650423002898831 # guid("f") + 14740650423002898831: # guid("f") + - +... 
diff --git a/llvm/test/Transforms/LowerTypeTests/drop_type_test.ll b/llvm/test/Transforms/LowerTypeTests/drop_type_test.ll new file mode 100644 index 0000000000000..e1d0573924a4e --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/drop_type_test.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -passes=lowertypetests -lowertypetests-drop-type-tests=all < %s | FileCheck %s + +define void @func() { +entry: + %0 = tail call i1 @llvm.type.test(ptr null, metadata !"foo") + br i1 %0, label %exit, label %trap + +trap: + unreachable + +exit: + ret void + ; CHECK-LABEL: entry: + ; CHECK-NEXT: br i1 true, label %exit, label %trap + ; CHECK-LABEL: trap: + ; CHECK-NEXT: unreachable + ; CHECK-LABEL: exit: + ; CHECK-NEXT: ret void +} + +declare i1 @llvm.type.test(ptr, metadata) #0 +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/LowerTypeTests/drop_type_test_phi.ll b/llvm/test/Transforms/LowerTypeTests/drop_type_test_phi.ll index 3cf4d447605da..820865826dc7a 100644 --- a/llvm/test/Transforms/LowerTypeTests/drop_type_test_phi.ll +++ b/llvm/test/Transforms/LowerTypeTests/drop_type_test_phi.ll @@ -1,5 +1,5 @@ ; Test to ensure dropping of type tests can handle a phi feeding the assume. 
-; RUN: opt -S -passes=lowertypetests -lowertypetests-drop-type-tests -mtriple=x86_64-unknown-linux-gnu %s | FileCheck %s +; RUN: opt -S -passes=lowertypetests -lowertypetests-drop-type-tests=assume -mtriple=x86_64-unknown-linux-gnu %s | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-grtev4-linux-gnu" diff --git a/llvm/test/Transforms/NewGVN/pr113997.ll b/llvm/test/Transforms/NewGVN/pr113997.ll new file mode 100644 index 0000000000000..a919c8c304b1b --- /dev/null +++ b/llvm/test/Transforms/NewGVN/pr113997.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=newgvn < %s | FileCheck %s + +; Make sure attributes in function calls are intersected correctly. + +define i1 @bucket(i32 noundef %x) { +; CHECK-LABEL: define i1 @bucket( +; CHECK-SAME: i32 noundef [[X:%.*]]) { +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: [[CTPOP1:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ult i32 [[CTPOP1]], 2 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CTPOP1]], 1 +; CHECK-NEXT: ret i1 [[RES]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: ret i1 false +; + %cmp1 = icmp sgt i32 %x, 0 + %ctpop1 = tail call range(i32 1, 32) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp samesign ult i32 %ctpop1, 2 + %cond = select i1 %cmp1, i1 %cmp2, i1 false + br i1 %cond, label %if.then, label %if.else + +if.else: + %ctpop2 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %x) + %res = icmp eq i32 %ctpop2, 1 + ret i1 %res + +if.then: + ret i1 false +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll 
index 7274e95256769..6e9256afc7a8d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -12,26 +12,79 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[RAND_BLOCK_LENGTH]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER23:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <2 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT19]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX1]], i64 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[WIDE_LOAD18]] to <2 x double> +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[BROADCAST_SPLAT]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <2 x double> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <2 x double> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP7]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP6]], <2 x double> ) +; CHECK-NEXT: [[TMP13:%.*]] = tail call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP7]], <2 x double> ) +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI16]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP8]], <2 x double> [[TMP10]], <2 x double> +; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP9]], <2 x double> [[TMP11]], <2 x double> +; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI]], [[TMP16]] +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI15]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], 
label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <2 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <2 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER23]] +; CHECK: [[FOR_BODY_PREHEADER23]]: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_012_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP21]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_011_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_011:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V1_1:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V0_010:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V0_1:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER23]] ] +; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_012_PH]], %[[FOR_BODY_PREHEADER23]] ] +; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_011_PH]], %[[FOR_BODY_PREHEADER23]] ] ; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] ; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ogt double [[SUB]], 0.000000e+00 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[SUB]], [[V0_010]] ; CHECK-NEXT: [[MUL3:%.*]] = fmul fast double [[SUB]], [[SUB]] -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast double [[MUL3]], [[V1_011]] -; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], double [[ADD]], double [[V0_010]] -; CHECK-NEXT: [[V1_1]] = select i1 [[CMP1]], double [[ADD4]], double [[V1_011]] +; CHECK-NEXT: [[ADD8:%.*]] = tail call fast double @llvm.maxnum.f64(double [[SUB]], double -0.000000e+00) +; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] +; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 +; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: @@ -136,17 +189,72 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: br i1 [[CMP211]], label 
%[[FOR_BODY_US_PREHEADER:.*]], label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY_US_PREHEADER]]: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[RAND_BLOCK_LENGTH]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483644 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <2 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT35]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x 
double> , double [[V1_021_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> , double [[V0_020_US]], i64 0 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <2 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <2 x double> [ , %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_US1]], i64 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = fpext <2 x float> [[WIDE_LOAD34]] to <2 x double> +; CHECK-NEXT: [[TMP6:%.*]] = tail call fast <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = tail call fast <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP6]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP8]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <2 x double> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <2 x double> [[TMP11]], zeroinitializer +; CHECK-NEXT: 
[[TMP14:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = tail call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP10]], <2 x double> ) +; CHECK-NEXT: [[TMP17:%.*]] = tail call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP11]], <2 x double> ) +; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI32]], [[TMP16]] +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI33]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP12]], <2 x double> [[TMP14]], <2 x double> +; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP13]], <2 x double> [[TMP15]], <2 x double> +; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI]], [[TMP20]] +; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <2 x double> [[VEC_PHI31]], [[TMP21]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <2 x double> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <2 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX37]]) +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] +; CHECK: [[FOR_BODY3_US_PREHEADER]]: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: 
[[V1_116_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_115_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ] -; CHECK-NEXT: [[V1_114_US:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] -; CHECK-NEXT: [[V0_113_US:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[V0_2_US]], %[[FOR_BODY3_US]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_116_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_115_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double @@ -154,15 +262,17 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] ; CHECK-NEXT: [[SUB_US:%.*]] = fsub fast double [[MUL_US]], [[Z]] ; CHECK-NEXT: [[CMP4_US:%.*]] = fcmp fast ogt double [[SUB_US]], 0.000000e+00 -; CHECK-NEXT: [[ADD_US:%.*]] = fadd fast double [[SUB_US]], [[V0_113_US]] -; CHECK-NEXT: [[MUL6_US:%.*]] = fmul fast double [[SUB_US]], [[SUB_US]] -; CHECK-NEXT: [[ADD7_US:%.*]] = fadd fast double [[MUL6_US]], [[V1_114_US]] -; CHECK-NEXT: [[V0_2_US]] = select i1 [[CMP4_US]], double [[ADD_US]], double [[V0_113_US]] -; CHECK-NEXT: [[V1_2_US]] = select i1 [[CMP4_US]], double [[ADD7_US]], 
double [[V1_114_US]] +; CHECK-NEXT: [[ADD7_US:%.*]] = fmul fast double [[SUB_US]], [[SUB_US]] +; CHECK-NEXT: [[ADD12_US:%.*]] = tail call fast double @llvm.maxnum.f64(double [[SUB_US]], double -0.000000e+00) +; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] +; CHECK-NEXT: [[ADD7_US1:%.*]] = select i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 +; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]] +; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: +; CHECK-NEXT: [[V0_2_US_LCSSA]] = phi double [ [[TMP26]], %[[MIDDLE_BLOCK]] ], [ [[V0_2_US]], %[[FOR_BODY3_US]] ] +; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] @@ -173,8 +283,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, 
%[[FOR_BODY]] ] +; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] ; CHECK-NEXT: ret double [[ADD11]] ; @@ -292,3 +402,11 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) declare void @resample(i32 noundef, ptr noundef) declare double @llvm.exp2.f64(double) declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll index 1d1c9d1f1d18c..324503a30783d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll @@ -108,11 +108,10 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a, <8 x float> %b) #0 { define <8 x float> @reverse_hadd_v8f32(<8 x float> %a, <8 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[SHUFFLE]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll index 4f8f04ec42497..9d3b69218313e 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll @@ -108,11 +108,10 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a, <8 x float> %b) #0 { define <8 x float> @reverse_hadd_v8f32(<8 x float> %a, <8 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector 
<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[SHUFFLE]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index 4a024cc4c0309..53d4b1ad96cb8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -32,10 +32,9 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; AVX1-NEXT: ret <4 x double> [[SHUFFLE]] ; ; AVX2-LABEL: @PR50392( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] ; AVX2-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP5]], <4 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll index 1d4cee45b6685..6ff68f50db1b7 100644 --- 
a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll @@ -16,12 +16,18 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; SSE-NEXT: ret <4 x double> [[TMP4]] ; -; AVX-LABEL: @PR94546( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX-NEXT: ret <4 x double> [[TMP4]] +; AVX1-LABEL: @PR94546( +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX1-NEXT: ret <4 x double> [[TMP4]] +; +; AVX2-LABEL: @PR94546( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 @@ -43,5 +49,4 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { ret <4 x double> %shuffle } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX1: {{.*}} -; AVX2: {{.*}} +; AVX: {{.*}} diff --git a/llvm/test/Transforms/SCCP/vscale-intrinsic.ll b/llvm/test/Transforms/SCCP/vscale-intrinsic.ll new file mode 100644 index 0000000000000..7dc7ea9b99191 --- /dev/null +++ b/llvm/test/Transforms/SCCP/vscale-intrinsic.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=sccp -S | FileCheck %s + +define i1 @vscale_i32_noattr() { +; CHECK-LABEL: define i1 @vscale_i32_noattr() { +; CHECK-NEXT: [[SCALE:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i32 [[SCALE]], 16 +; CHECK-NEXT: [[RES:%.*]] = and i1 true, [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %scale = call i32 @llvm.vscale.i32() + %cmp1 = icmp uge i32 %scale, 1 + %cmp2 = icmp ule i32 %scale, 16 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @vscale_i32_attr() vscale_range(1, 16) { +; CHECK-LABEL: define i1 @vscale_i32_attr( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SCALE:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: ret i1 true +; + %scale = call i32 @llvm.vscale.i32() + %cmp1 = icmp uge i32 %scale, 1 + %cmp2 = icmp ule i32 %scale, 16 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @vscale_i64_noattr() { +; CHECK-LABEL: define i1 @vscale_i64_noattr() { +; CHECK-NEXT: [[SCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[CMP2:%.*]] = icmp ule i64 [[SCALE]], 16 +; CHECK-NEXT: [[RES:%.*]] = and i1 true, [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %scale = call i64 @llvm.vscale.i64() + %cmp1 = icmp uge i64 %scale, 1 + %cmp2 = icmp ule i64 %scale, 16 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @vscale_i64_attr() vscale_range(1, 16) { +; CHECK-LABEL: define i1 @vscale_i64_attr( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[SCALE:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: ret i1 true +; + %scale = call i64 @llvm.vscale.i64() + %cmp1 = icmp uge 
i64 %scale, 1 + %cmp2 = icmp ule i64 %scale, 16 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i32 @vscale_branch_elim(i32 %x) vscale_range(1, 16) { +; CHECK-LABEL: define i32 @vscale_branch_elim( +; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SCALE:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[BOUND:%.*]] = shl nuw nsw i32 [[SCALE]], 3 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + %scale = call i32 @llvm.vscale.i32() + %bound = shl nsw nuw i32 %scale, 3 + %cmp = icmp uge i32 1, %bound + br i1 %cmp, label %if.then, label %if.end + +if.then: + %double = mul i32 %x, 2 + br label %if.end + +if.end: + %res = phi i32 [ %x, %entry ], [ %double, %if.then] + ret i32 %res +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll index e4b6c06b79fc1..8b548e355dadc 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll @@ -28,9 +28,9 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '2' +; YAML-NEXT: - Cost: '8' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '9' +; YAML-NEXT: - TreeSize: '5' define void @foo() personality ptr @bar { ; CHECK-LABEL: @foo( @@ -44,8 +44,10 @@ define void @foo() personality ptr @bar { ; CHECK-NEXT: ret void ; CHECK: bb3: ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i64> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = invoke i64 poison(ptr addrspace(1) nonnull poison, i64 0, i64 0, i64 poison) [ "deopt"() ] -; CHECK-NEXT: to label [[BB4:%.*]] unwind label [[BB10:%.*]] +; CHECK-NEXT: to label [[BB4:%.*]] unwind label 
[[BB10:%.*]] ; CHECK: bb4: ; CHECK-NEXT: br i1 poison, label [[BB11:%.*]], label [[BB5:%.*]] ; CHECK: bb5: @@ -55,9 +57,8 @@ define void @foo() personality ptr @bar { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i64 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[LOCAL_5_84111]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = invoke i64 poison(ptr addrspace(1) nonnull poison, i64 poison, i64 poison, i64 poison) [ "deopt"() ] -; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] +; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: @@ -65,16 +66,22 @@ define void @foo() personality ptr @bar { ; CHECK-NEXT: [[TMP7]] = phi <2 x i64> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP8]] = phi <2 x i64> [ [[TMP2]], [[BB3]] ] +; CHECK-NEXT: [[LOCAL_10_38123_LCSSA:%.*]] = phi i64 [ [[TMP10]], [[BB3]] ] +; CHECK-NEXT: [[LOCAL_5_33118_LCSSA:%.*]] = phi i64 [ [[TMP5]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i64 } -; CHECK-NEXT: cleanup +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[LOCAL_10_38123_LCSSA]], i32 0 +; CHECK-NEXT: [[TMP8]] = insertelement <2 x i64> [[TMP12]], i64 [[LOCAL_5_33118_LCSSA]], i32 1 ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP9]] = phi <2 x i64> [ [[TMP5]], [[BB7]] ] +; CHECK-NEXT: [[LOCAL_10_89113_LCSSA:%.*]] = phi i64 [ poison, [[BB7]] ] +; CHECK-NEXT: [[LOCAL_5_84111_LCSSA:%.*]] = phi i64 [ [[LOCAL_5_84111]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i64 } -; CHECK-NEXT: cleanup +; CHECK-NEXT: cleanup +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[LOCAL_10_89113_LCSSA]], i32 0 +; CHECK-NEXT: [[TMP9]] = insertelement <2 x 
i64> [[TMP11]], i64 [[LOCAL_5_84111_LCSSA]], i32 1 ; CHECK-NEXT: br label [[BB9]] ; bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll index 6f5f933761092..f1f83c0663099 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -18,13 +18,15 @@ define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btCons ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: store i32 1, ptr [[INFO]], align 4 -; CHECK-NEXT: store i32 5, ptr [[NUB5]], align 4 +; CHECK-NEXT: store <2 x i32> , ptr [[INFO]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], -; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[INFO]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 5, [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 1, [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC_2]], ptr [[INFO]], align 4 +; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: store i32 [[DEC_2]], ptr [[NUB5]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll new file mode 100644 index 0000000000000..469f165d302a9 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-reduced-value-vectorized.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + 
+define i32 @test(ptr %c, i16 %a, i16 %0) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[C:%.*]], i16 [[A:%.*]], i16 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i1> [[TMP3]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <4 x i16> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i16 [[A]], -2 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP9]], i32 7 +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP11]], <4 x i1> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP13:%.*]] = freeze <8 x i1> [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[C]], align 4 +; CHECK-NEXT: ret i32 0 +; +entry: + %tobool = icmp ne i16 %a, 0 + %1 = zext i1 %tobool to i16 + %cmp3 = icmp ugt i16 %0, %1 + %2 = and i1 %tobool, %cmp3 + %tobool.1 = icmp ne i16 %a, 0 + %3 = zext i1 %tobool.1 to i16 + %cmp3.1 = icmp ugt i16 %0, %3 + %4 = and i1 %tobool.1, %cmp3.1 + %5 = select i1 %2, i1 %4, i1 false + %tobool.2 = icmp ne i16 %a, 0 + %6 = zext i1 %tobool.2 to i16 + %cmp3.2 = icmp ugt i16 %0, %6 + %7 = and i1 %tobool.2, %cmp3.2 + %8 = select i1 %5, i1 %7, i1 false + %tobool.3 = icmp ne i16 %a, 0 + %9 = zext i1 %tobool.3 to i16 + 
%cmp3.3 = icmp ugt i16 %a, %9 + %10 = icmp ult i16 %a, -2 + %11 = and i1 %10, %cmp3.3 + %12 = select i1 %8, i1 %11, i1 false + %13 = zext i1 %12 to i32 + store i32 %13, ptr %c, align 4 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-phi-in-landingpad.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-phi-in-landingpad.ll index 7476c77c58320..adbe9c33140b6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-phi-in-landingpad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-phi-in-landingpad.ll @@ -10,10 +10,10 @@ define void @test() personality ptr null { ; CHECK-NEXT: invoke void null() ; CHECK-NEXT: to label %[[BB65]] unwind label %[[BB4]] ; CHECK: [[BB4]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ poison, %[[BB2]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, %[[BB]] ], [ 0, %[[BB2]] ] +; CHECK-NEXT: [[PHI6:%.*]] = phi i32 [ 0, %[[BB]] ], [ 0, %[[BB2]] ] ; CHECK-NEXT: [[LANDINGPAD:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 ; CHECK-NEXT: call void null(i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 
[[TMP1]], i32 [[TMP1]], i32 [[TMP1]], i32 [[TMP1]]) ; CHECK-NEXT: ret void ; CHECK: [[BB65]]: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll index 578d463a9489d..79698e22c1d89 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll @@ -7,16 +7,24 @@ define void @test1(ptr %a, ptr %b, ptr %c) #0 personality ptr @__CxxFrameHandler ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @_CxxThrowException(ptr null, ptr null) -; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller ; CHECK: catch: ; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [ptr null, i32 64, ptr null] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ] -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[I0:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[I1:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call double @floor(double [[MUL]]) #[[ATTR1:[0-9]+]] [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 +; CHECK-NEXT: [[I4:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]] 
+; CHECK-NEXT: [[CALL5:%.*]] = tail call double @floor(double [[MUL5]]) #[[ATTR1]] [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: store double [[CALL]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[C]], i64 1 +; CHECK-NEXT: store double [[CALL5]], ptr [[ARRAYIDX5]], align 8 ; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT:%.*]] ; CHECK: try.cont: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 47b42bc8f32a7..2a036cc8fe326 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -14,6 +14,8 @@ define void @foo() personality ptr @bar { ; CHECK-NEXT: ret void ; CHECK: bb3: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -25,7 +27,6 @@ define void @foo() personality ptr @bar { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: @@ -33,19 +34,25 @@ define void @foo() personality ptr @bar { ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] -; 
CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[LOCAL_10_38123_LCSSA:%.*]] = phi i32 [ [[TMP10]], [[BB3]] ] +; CHECK-NEXT: [[LOCAL_5_33118_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_10_38123_LCSSA]], i32 0 +; CHECK-NEXT: [[TMP8]] = insertelement <2 x i32> [[TMP12]], i32 [[LOCAL_5_33118_LCSSA]], i32 1 ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] +; CHECK-NEXT: [[LOCAL_10_89113_LCSSA:%.*]] = phi i32 [ poison, [[BB7]] ] +; CHECK-NEXT: [[LOCAL_5_84111_LCSSA:%.*]] = phi i32 [ [[LOCAL_5_84111]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_10_89113_LCSSA]], i32 0 +; CHECK-NEXT: [[TMP9]] = insertelement <2 x i32> [[TMP11]], i32 [[LOCAL_5_84111_LCSSA]], i32 1 ; CHECK-NEXT: br label [[BB9]] ; bb1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll index c758d05cab800..3f765d5a51e88 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll @@ -7,14 +7,17 @@ define void @test_phi_in_landingpad() personality ptr ; CHECK-LABEL: @test_phi_in_landingpad( ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label [[INNER:%.*]] unwind label [[LPAD:%.*]] +; CHECK-NEXT: to label [[INNER:%.*]] unwind label [[LPAD:%.*]] ; CHECK: inner: ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label 
[[DONE:%.*]] unwind label [[LPAD]] +; CHECK-NEXT: to label [[DONE:%.*]] unwind label [[LPAD]] ; CHECK: lpad: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[ENTRY:%.*]] ], [ undef, [[INNER]] ] +; CHECK-NEXT: [[X1:%.*]] = phi double [ undef, [[ENTRY:%.*]] ], [ undef, [[INNER]] ] +; CHECK-NEXT: [[Y1:%.*]] = phi double [ undef, [[ENTRY]] ], [ undef, [[INNER]] ] ; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 } -; CHECK-NEXT: catch ptr null +; CHECK-NEXT: catch ptr null +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[X1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> [[TMP3]], double [[Y1]], i32 1 ; CHECK-NEXT: br label [[DONE]] ; CHECK: done: ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ undef, [[INNER]] ], [ [[TMP0]], [[LPAD]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll index 9979bb9170d48..eed772b0dd104 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll @@ -9,7 +9,7 @@ define void @test1(ptr %p) personality ptr @__CxxFrameHandler3 { ; CHECK-LABEL: @test1( ; CHECK-NEXT: invoke.cont: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 -; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: [[LOAD1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 ; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[P]], align 8 ; CHECK-NEXT: invoke void @throw() ; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll index 590e5a67bd4ce..f9815529a2375 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -34,16 +34,24 @@ define void @hoge() { ; CHECK-NEXT: [[T23:%.*]] = sub nsw i32 undef, [[T4]] ; CHECK-NEXT: 
[[T24:%.*]] = sub i32 [[T23]], undef ; CHECK-NEXT: [[T25:%.*]] = add nsw i32 [[T24]], -49 -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T22]], -33 -; CHECK-NEXT: [[T35:%.*]] = add nsw i32 [[T24]], -33 -; CHECK-NEXT: [[T40:%.*]] = add nsw i32 [[T22]], -17 -; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 undef, [[T25]] +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[T25]], undef ; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[T25]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = icmp slt i32 [[T30]], [[T35]] +; CHECK-NEXT: [[T28:%.*]] = icmp sgt i32 [[OP_RDX1]], undef +; CHECK-NEXT: [[T30:%.*]] = select i1 [[T28]], i32 undef, i32 [[OP_RDX1]] +; CHECK-NEXT: [[T32:%.*]] = add nsw i32 [[T22]], -33 +; CHECK-NEXT: [[T31:%.*]] = icmp sgt i32 [[T32]], undef +; CHECK-NEXT: [[T35:%.*]] = select i1 [[T31]], i32 undef, i32 [[T32]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[T35]], [[T30]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[T30]], i32 [[T35]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = icmp slt i32 [[OP_RDX1]], [[OP_RDX3]] -; CHECK-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]] -; CHECK-NEXT: [[OP_RDX6:%.*]] = icmp slt i32 [[OP_RDX5]], [[T40]] +; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T24]], -33 +; CHECK-NEXT: [[T36:%.*]] = icmp sgt i32 [[T39]], undef +; CHECK-NEXT: [[T37:%.*]] = select i1 [[T36]], i32 undef, i32 [[T39]] +; CHECK-NEXT: [[T38:%.*]] = icmp sgt i32 [[T37]], [[OP_RDX3]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = select i1 [[T38]], i32 [[OP_RDX3]], i32 [[T37]] +; CHECK-NEXT: [[T42:%.*]] = add nsw i32 [[T22]], -17 +; CHECK-NEXT: [[T41:%.*]] = icmp sgt i32 [[T42]], undef +; CHECK-NEXT: [[T40:%.*]] = select i1 [[T41]], i32 undef, i32 [[T42]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = icmp sgt i32 [[T40]], [[OP_RDX5]] ; CHECK-NEXT: [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[T40]] ; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX7]] ; CHECK-NEXT: unreachable diff --git 
a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index f32e315142767..aec81086105d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -355,3 +355,57 @@ entry: %10 = icmp ne <2 x i8> %8, zeroinitializer ret void } + +define void @test12() { +; CHECK-LABEL: @test12( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 33 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr null, i64 50 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr null, i64 75 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double> +; CHECK-NEXT: [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16) +; CHECK-NEXT: [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24) +; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 
x float> +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP17:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16) +; CHECK-NEXT: [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24) +; CHECK-NEXT: [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]] +; CHECK-NEXT: ret void +; +entry: + %0 = getelementptr float, ptr null, i64 33 + %1 = getelementptr float, ptr null, i64 41 + %2 = getelementptr float, ptr null, i64 50 + %3 = getelementptr float, ptr null, i64 75 + %4 = load <8 x float>, ptr %0, align 4 + %5 = load <8 x float>, ptr %1, align 4 + %6 = load <8 x float>, ptr %2, align 4 + %7 = load <8 x float>, ptr %3, align 4 + %8 = fpext <8 x float> %4 to <8 x double> + %9 = fpext <8 x float> %5 to <8 x double> + %10 = fpext <8 x float> %6 to <8 x double> + %11 = fpext <8 x float> %7 to <8 x double> + %12 = fadd <8 x double> zeroinitializer, %8 + %13 = fadd <8 x double> zeroinitializer, %9 + %14 = fadd <8 x double> zeroinitializer, %10 + %15 = fadd <8 x double> zeroinitializer, %11 + %16 = fptrunc <8 x double> %12 to <8 x float> + %17 = fptrunc <8 x double> %13 to <8 x float> + %18 = fptrunc <8 x double> %14 to <8 x float> + %19 = fptrunc <8 x double> %15 to <8 x float> + %20 = fcmp ogt <8 x float> zeroinitializer, %16 + %21 = fcmp ogt <8 x float> zeroinitializer, %17 + %22 = fcmp ogt <8 x float> zeroinitializer, %18 + %23 = fcmp ogt <8 x float> zeroinitializer, %19 + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll index 10a3e65e5f57d..385e37e2750d1 100644 
--- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll @@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: LOOP.HEADER: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ] ; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]] @@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ] ; CHECK-NEXT: br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]] ; CHECK: INNER_LOOP: ; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ] @@ -66,20 +66,19 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271 ; CHECK-NEXT: br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ] -; 
CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] -; CHECK-NEXT: br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] +; CHECK-NEXT: br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] ; CHECK: Flow4: -; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] ; CHECK: bb64: ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] -; CHECK-NEXT: br i1 [[TMP10]], label [[BB18]], label [[FLOW2]] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] +; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW2]] ; CHECK: INCREMENT_I: ; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336 diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll index c832b7d1394a8..46881ec827286 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll @@ -7,8 +7,8 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; 
CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]] ; CHECK: [[FLOW]]: @@ -23,20 +23,17 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP3:%.*]] = phi float [ undef, %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: 
[[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -75,8 +72,8 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] ; CHECK: [[IF]]: @@ -91,20 +88,17 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP8]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi 
float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -143,9 +137,9 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW1:.*]] ] -; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW1]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW1:.*]] ] +; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FLOW1]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp slt i32 [[IND]], [[X]] ; CHECK-NEXT: [[CC_INV:%.*]] = xor i1 [[CC]], true ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] @@ -156,23 +150,18 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ undef, %[[HEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] -; CHECK-NEXT: br i1 
[[TMP3]], label %[[LATCH:.*]], label %[[FLOW1]] +; CHECK-NEXT: [[TMP7]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] +; CHECK-NEXT: br i1 [[TMP2]], label %[[LATCH:.*]], label %[[FLOW1]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP4]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP5]] = phi float [ [[TMP1]], %[[LATCH]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP6]] = phi float [ [[TMP2]], %[[LATCH]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi float [ [[TMP1]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP2]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP3]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP4]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: [[PACKED0:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 ; CHECK-NEXT: [[PACKED1:%.*]] = insertelement <2 x float> [[PACKED0]], float [[TMP7]], i32 1 diff --git a/llvm/test/Transforms/Util/assume-simplify.ll b/llvm/test/Transforms/Util/assume-simplify.ll index 1fd807429d642..d336d4420c90a 100644 --- a/llvm/test/Transforms/Util/assume-simplify.ll +++ b/llvm/test/Transforms/Util/assume-simplify.ll @@ -23,11 +23,11 @@ define i32 @test1(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3) { ; CHECK-NEXT: [[I8:%.*]] = load i32, ptr [[ARG1]], align 4 ; CHECK-NEXT: [[I9:%.*]] = add nsw i32 
[[I7]], [[I8]] ; CHECK-NEXT: call void @may_throw() -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[ARG1]], i64 4), "ignore"(ptr undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[ARG1]], i64 4), "ignore"(ptr poison) ] ; CHECK-NEXT: store i32 [[I9]], ptr [[ARG1]], align 4 ; CHECK-NEXT: br label [[B:%.*]] ; CHECK: A: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARG]], i64 4), "ignore"(ptr undef, i64 4), "ignore"(ptr undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[ARG]], i64 4), "ignore"(ptr poison, i64 4), "ignore"(ptr poison) ] ; CHECK-NEXT: br label [[B]] ; CHECK: B: ; CHECK-NEXT: ret i32 0 @@ -278,7 +278,7 @@ define i32 @test5A(ptr dereferenceable(8) %p, i32 %i) { ; CHECK-SAME: (ptr dereferenceable(32) [[P:%.*]], i32 [[I:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "cold"(), "ignore"(ptr undef, i32 32) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "cold"(), "ignore"(ptr poison, i32 32) ] ; CHECK-NEXT: br i1 [[COND]], label [[A:%.*]], label [[B:%.*]] ; CHECK: A: ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index 66fe11369d88b..459ede173b841 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -937,10 +937,9 @@ define <4 x i64> @cast_mismatched_types(<4 x i32> %x) { define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: @fadd_mismatched_types( -; CHECK-NEXT: [[SHUF_X:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF_Y:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[FADD:%.*]] = fadd fast <2 x float> [[SHUF_X]], [[SHUF_Y]] -; CHECK-NEXT: 
[[EXTSHUF:%.*]] = shufflevector <2 x float> [[FADD]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[EXTSHUF]] ; %shuf.x = shufflevector <4 x float> %x, <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll index e94868c7b9e5b..8db1990dcbb5d 100644 --- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll @@ -9,11 +9,10 @@ declare void @use_v4f64(<4 x double>) define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: define <4 x double> @fadd_v4f64( ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> @@ -25,11 +24,10 @@ define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) { define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: define <4 x double> 
@fadd_v4f64_poison_idx( ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> @@ -41,11 +39,10 @@ define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) { define <4 x double> @fadd_mixed_types(<4 x double> %a, <2 x double> %b) { ; CHECK-LABEL: define <4 x double> @fadd_mixed_types( ; CHECK-SAME: <4 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> @@ -95,11 +92,10 @@ define <4 x double> 
@fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> % define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: define <4 x i32> @sdiv_v4i32( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = sdiv <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x i32> [[POST]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[POST1]] +; CHECK-NEXT: ret <4 x i32> [[POST]] ; %a1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> %b1 = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 408b9c3993428..6702725e4fc8a 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -34,7 +34,7 @@ # # CHECK: << Total TLI yes SDK no: 18 # CHECK: >> Total TLI no SDK yes: 0 -# CHECK: == Total TLI yes SDK yes: 268 +# CHECK: == Total TLI yes SDK yes: 271 # # WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*) # WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int) @@ -48,14 +48,14 @@ # WRONG_DETAIL: << TLI yes SDK no : 'fminimum_numl' # WRONG_SUMMARY: << Total TLI yes SDK no: 19{{$}} # WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}} -# WRONG_SUMMARY: == Total TLI yes SDK yes: 267 +# WRONG_SUMMARY: == Total TLI yes SDK yes: 270 # ## The -COUNT suffix doesn't care if there are too many matches, so check ## the exact count first; the two directives should add up to that. 
## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. -# AVAIL: TLI knows 519 symbols, 286 available -# AVAIL-COUNT-286: {{^}} available +# AVAIL: TLI knows 522 symbols, 289 available +# AVAIL-COUNT-289: {{^}} available # AVAIL-NOT: {{^}} available # UNAVAIL-COUNT-233: not available # UNAVAIL-NOT: not available @@ -602,6 +602,18 @@ DynamicSymbols: Type: STT_FUNC Section: .text Binding: STB_GLOBAL + - Name: hypot + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: hypotf + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + - Name: hypotl + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL - Name: isdigit Type: STT_FUNC Section: .text diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index 98f8989d4e6e9..982d00c5d3359 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -249,6 +249,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare %struct* @getpwnam(i8*)\n" "declare i8* @gets(i8*)\n" "declare i32 @gettimeofday(%struct*, i8*)\n" + "declare double @hypot(double, double)\n" + "declare float @hypotf(float, float)\n" + "declare x86_fp80 @hypotl(x86_fp80, x86_fp80)\n" "declare i32 @_Z7isasciii(i32)\n" "declare i32 @_Z7isdigiti(i32)\n" "declare i64 @labs(i64)\n" diff --git a/llvm/unittests/CGData/CMakeLists.txt b/llvm/unittests/CGData/CMakeLists.txt index 792b323130b47..0bdb9e1f08c70 100644 --- a/llvm/unittests/CGData/CMakeLists.txt +++ b/llvm/unittests/CGData/CMakeLists.txt @@ -9,6 +9,8 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(CGDataTests OutlinedHashTreeRecordTest.cpp OutlinedHashTreeTest.cpp + StableFunctionMapRecordTest.cpp + StableFunctionMapTest.cpp ) target_link_libraries(CGDataTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp b/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp new file 
mode 100644 index 0000000000000..f5c9afe449da3 --- /dev/null +++ b/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp @@ -0,0 +1,127 @@ +//===- StableFunctionMapRecordTest.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CGData/StableFunctionMapRecord.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +TEST(StableFunctionMapRecordTest, Print) { + StableFunctionMapRecord MapRecord; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}}; + MapRecord.FunctionMap->insert(Func1); + + const char *ExpectedMapStr = R"(--- +- Hash: 1 + FunctionName: Func1 + ModuleName: Mod1 + InstCount: 2 + IndexOperandHashes: + - InstIndex: 0 + OpndIndex: 1 + OpndHash: 3 +... +)"; + std::string MapDump; + raw_string_ostream OS(MapDump); + MapRecord.print(OS); + EXPECT_EQ(ExpectedMapStr, MapDump); +} + +TEST(StableFunctionMapRecordTest, Stable) { + StableFunction Func1{1, "Func2", "Mod1", 1, {}}; + StableFunction Func2{1, "Func3", "Mod1", 1, {}}; + StableFunction Func3{1, "Func1", "Mod2", 1, {}}; + StableFunction Func4{2, "Func4", "Mod3", 1, {}}; + + StableFunctionMapRecord MapRecord1; + MapRecord1.FunctionMap->insert(Func1); + MapRecord1.FunctionMap->insert(Func2); + MapRecord1.FunctionMap->insert(Func3); + MapRecord1.FunctionMap->insert(Func4); + + StableFunctionMapRecord MapRecord2; + MapRecord2.FunctionMap->insert(Func4); + MapRecord2.FunctionMap->insert(Func3); + MapRecord2.FunctionMap->insert(Func2); + MapRecord2.FunctionMap->insert(Func1); + + // Output is sorted by hash (1 < 2), module name (Mod1 < Mod2), and function + // name (Func2 < Func3). 
+ std::string MapDump1; + raw_string_ostream OS1(MapDump1); + MapRecord1.print(OS1); + std::string MapDump2; + raw_string_ostream OS2(MapDump2); + MapRecord2.print(OS2); + EXPECT_EQ(MapDump1, MapDump2); +} + +TEST(StableFunctionMapRecordTest, Serialize) { + StableFunctionMapRecord MapRecord1; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}, {{1, 2}, 4}}}; + StableFunction Func2{2, "Func2", "Mod1", 3, {{{0, 1}, 2}}}; + StableFunction Func3{2, "Func3", "Mod1", 3, {{{0, 1}, 3}}}; + MapRecord1.FunctionMap->insert(Func1); + MapRecord1.FunctionMap->insert(Func2); + MapRecord1.FunctionMap->insert(Func3); + + // Serialize and deserialize the map. + SmallVector Out; + raw_svector_ostream OS(Out); + MapRecord1.serialize(OS); + + StableFunctionMapRecord MapRecord2; + const uint8_t *Data = reinterpret_cast(Out.data()); + MapRecord2.deserialize(Data); + + // Two maps should be identical. + std::string MapDump1; + raw_string_ostream OS1(MapDump1); + MapRecord1.print(OS1); + std::string MapDump2; + raw_string_ostream OS2(MapDump2); + MapRecord2.print(OS2); + + EXPECT_EQ(MapDump1, MapDump2); +} + +TEST(StableFunctionMapRecordTest, SerializeYAML) { + StableFunctionMapRecord MapRecord1; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}, {{1, 2}, 4}}}; + StableFunction Func2{2, "Func2", "Mod1", 3, {{{0, 1}, 2}}}; + StableFunction Func3{2, "Func3", "Mod1", 3, {{{0, 1}, 3}}}; + MapRecord1.FunctionMap->insert(Func1); + MapRecord1.FunctionMap->insert(Func2); + MapRecord1.FunctionMap->insert(Func3); + + // Serialize and deserialize the map in a YAML format. + std::string Out; + raw_string_ostream OS(Out); + yaml::Output YOS(OS); + MapRecord1.serializeYAML(YOS); + + StableFunctionMapRecord MapRecord2; + yaml::Input YIS(StringRef(Out.data(), Out.size())); + MapRecord2.deserializeYAML(YIS); + + // Two maps should be identical. 
+ std::string MapDump1; + raw_string_ostream OS1(MapDump1); + MapRecord1.print(OS1); + std::string MapDump2; + raw_string_ostream OS2(MapDump2); + MapRecord2.print(OS2); + + EXPECT_EQ(MapDump1, MapDump2); +} + +} // end namespace diff --git a/llvm/unittests/CGData/StableFunctionMapTest.cpp b/llvm/unittests/CGData/StableFunctionMapTest.cpp new file mode 100644 index 0000000000000..5e178dcd66724 --- /dev/null +++ b/llvm/unittests/CGData/StableFunctionMapTest.cpp @@ -0,0 +1,128 @@ +//===- StableFunctionMapTest.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CGData/StableFunctionMap.h" +#include "gmock/gmock-matchers.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +using testing::Contains; +using testing::IsEmpty; +using testing::Key; +using testing::Not; +using testing::Pair; +using testing::SizeIs; + +TEST(StableFunctionMap, Name) { + StableFunctionMap Map; + EXPECT_TRUE(Map.empty()); + EXPECT_TRUE(Map.getNames().empty()); + unsigned ID1 = Map.getIdOrCreateForName("Func1"); + unsigned ID2 = Map.getIdOrCreateForName("Func2"); + unsigned ID3 = Map.getIdOrCreateForName("Func1"); + + EXPECT_THAT(Map.getNames(), SizeIs(2)); + // The different names should return different IDs. + EXPECT_NE(ID1, ID2); + // The same name should return the same ID. + EXPECT_EQ(ID1, ID3); + // The IDs should be valid. 
+ EXPECT_EQ(*Map.getNameForId(ID1), "Func1"); + EXPECT_EQ(*Map.getNameForId(ID2), "Func2"); +} + +TEST(StableFunctionMap, Insert) { + StableFunctionMap Map; + + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}}; + StableFunction Func2{1, "Func2", "Mod1", 2, {{{0, 1}, 2}}}; + Map.insert(Func1); + Map.insert(Func2); + // We only have a unique hash, 1 + EXPECT_THAT(Map, SizeIs(1)); + // We have two functions with the same hash which are potentially mergeable. + EXPECT_EQ(Map.size(StableFunctionMap::SizeType::TotalFunctionCount), 2u); + EXPECT_EQ(Map.size(StableFunctionMap::SizeType::MergeableFunctionCount), 2u); +} + +TEST(StableFunctionMap, Merge) { + StableFunctionMap Map1; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}}; + StableFunction Func2{1, "Func2", "Mod1", 2, {{{0, 1}, 2}}}; + StableFunction Func3{2, "Func3", "Mod1", 2, {{{1, 1}, 2}}}; + Map1.insert(Func1); + Map1.insert(Func2); + Map1.insert(Func3); + + StableFunctionMap Map2; + StableFunction Func4{1, "Func4", "Mod2", 2, {{{0, 1}, 4}}}; + StableFunction Func5{2, "Func5", "Mod2", 2, {{{1, 1}, 5}}}; + StableFunction Func6{3, "Func6", "Mod2", 2, {{{1, 1}, 6}}}; + Map2.insert(Func4); + Map2.insert(Func5); + Map2.insert(Func6); + + // Merge two maps. + Map1.merge(Map2); + + // We only have two unique hashes, 1, 2 and 3 + EXPECT_THAT(Map1, SizeIs(3)); + // We have total 6 functions. + EXPECT_EQ(Map1.size(StableFunctionMap::SizeType::TotalFunctionCount), 6u); + // We have 5 mergeable functions. Func6 only has a unique hash, 3. + EXPECT_EQ(Map1.size(StableFunctionMap::SizeType::MergeableFunctionCount), 5u); +} + +TEST(StableFunctionMap, Finalize1) { + StableFunctionMap Map; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}}; + StableFunction Func2{1, "Func2", "Mod2", 3, {{{0, 1}, 2}}}; + Map.insert(Func1); + Map.insert(Func2); + + // Instruction count is mis-matched, so they're not mergeable. 
+ Map.finalize(); + EXPECT_THAT(Map, IsEmpty()); +} + +TEST(StableFunctionMap, Finalize2) { + StableFunctionMap Map; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}}; + StableFunction Func2{1, "Func2", "Mod2", 2, {{{0, 1}, 2}, {{1, 1}, 1}}}; + Map.insert(Func1); + Map.insert(Func2); + + // Operand map size is mis-matched, so they're not mergeable. + Map.finalize(); + EXPECT_THAT(Map, IsEmpty()); +} + +TEST(StableFunctionMap, Finalize3) { + StableFunctionMap Map; + StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}, {{1, 1}, 1}}}; + StableFunction Func2{1, "Func2", "Mod2", 2, {{{0, 1}, 2}, {{1, 1}, 1}}}; + Map.insert(Func1); + Map.insert(Func2); + + // The same operand entry is removed, which is redundant. + Map.finalize(); + auto &M = Map.getFunctionMap(); + EXPECT_THAT(M, SizeIs(1)); + auto &FuncEntries = M.begin()->second; + for (auto &FuncEntry : FuncEntries) { + EXPECT_THAT(*FuncEntry->IndexOperandHashMap, SizeIs(1)); + ASSERT_THAT(*FuncEntry->IndexOperandHashMap, + Not(Contains(Key(Pair(1, 1))))); + } +} + +} // end namespace diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 8f22d097681b1..87f96ed28e326 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // Create a stack location and ensure it's tracked. SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)}; SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL); - ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs. + ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs. // Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs, // then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}). // Finally, one for spilt fp80 registers. 
@@ -1135,7 +1135,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // There are other locations, for things like xmm0, which we're going to // ignore here. - auto [MInLocs, MOutLocs] = allocValueTables(4, 11); + auto [MInLocs, MOutLocs] = allocValueTables(4, 13); // Transfer function: start with nothing. SmallVector TransferFunc; @@ -1170,7 +1170,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // function. TransferFunc[1].insert({ALStackLoc, ALDefInBlk1}); TransferFunc[1].insert({HAXStackLoc, HAXDefInBlk1}); - initValueArray(MInLocs, 4, 11); + initValueArray(MInLocs, 4, 13); placeMLocPHIs(*MF, AllBlocks, MInLocs, TransferFunc); EXPECT_EQ(MInLocs[3][ALStackLoc.asU64()], ALPHI); EXPECT_EQ(MInLocs[3][AXStackLoc.asU64()], AXPHI); diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index dc40e5893b65e..1402c1d5b1398 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -193,9 +193,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { SDValue And = DAG->getNode(ISD::AND, DL, Int32VT, Op0, Op1); SDValue Xor = DAG->getNode(ISD::XOR, DL, Int32VT, Op1, Op0); SDValue Or = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1); - SDNodeFlags DisFlags; - DisFlags.setDisjoint(true); - SDValue DisOr = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op3, DisFlags); + SDValue DisOr = + DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op3, SDNodeFlags::Disjoint); SDValue SMax = DAG->getNode(ISD::SMAX, DL, Int32VT, Op0, Op1); SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0); SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1); @@ -293,10 +292,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue Op3 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Int32VT); SDValue ZExt = DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op0); - SDNodeFlags NNegFlags; - NNegFlags.setNonNeg(true); SDValue ZExtNNeg = - 
DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op3, NNegFlags); + DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op3, SDNodeFlags::NonNeg); SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0); SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1); diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp index db2b767607b97..2b6cd4984c8de 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp @@ -47,6 +47,10 @@ class ReOptimizeLayerTest : public testing::Test { if (Triple.isSystemZ()) GTEST_SKIP(); + // 32-bit X86 is not supported yet. + if (Triple.isX86() && Triple.isArch32Bit()) + GTEST_SKIP(); + if (Triple.isPPC()) GTEST_SKIP(); diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp index bb064b5f63b72..86182ccae0b55 100644 --- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp @@ -41,7 +41,12 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { VPBlockUtils::connectBlocks(VPBB3, VPBB4); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB0); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB0, ScalarHeaderVPBB); + VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -57,6 +62,7 @@ TEST(VPDominatorTreeTest, DominanceNoRegionsTest) { EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB3), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB2, VPBB4), VPBB1); EXPECT_EQ(VPDT.findNearestCommonDominator(VPBB4, VPBB4), VPBB4); + delete ScalarHeader; } static void @@ -71,6 +77,8 @@ checkDomChildren(VPDominatorTree &VPDT, VPBlockBase *Src, } TEST(VPDominatorTreeTest, DominanceRegionsTest) { + 
LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); { // 2 consecutive regions. // VPBB0 @@ -115,7 +123,9 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R1, R2); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB0); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB0, ScalarHeaderVPBB); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -195,7 +205,9 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { VPBlockUtils::connectBlocks(R1, VPBB2); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); VPDominatorTree VPDT; VPDT.recalculate(Plan); @@ -208,8 +220,9 @@ TEST(VPDominatorTreeTest, DominanceRegionsTest) { checkDomChildren(VPDT, R2BB2, {R2BB3}); checkDomChildren(VPDT, R2BB3, {}); checkDomChildren(VPDT, R1BB3, {VPBB2}); - checkDomChildren(VPDT, VPBB2, {}); + checkDomChildren(VPDT, VPBB2, {ScalarHeaderVPBB}); } + delete ScalarHeader; } } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 01d630124a4bb..93277eed8be12 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -145,6 +145,18 @@ compound=true ] N6 [label = "scalar.ph:\l" + + "Successor(s): ir-bb\\l" + ] + N6 -> N7 [ label=""] + N7 [label = + "ir-bb\:\l" + + " IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\l" + + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\l" + + " IR %l1 = load i32, ptr %arr.idx, align 4\l" + + " IR %res = add i32 %l1, 10\l" + + " IR store i32 %res, ptr %arr.idx, align 4\l" + + " IR %indvars.iv.next = add i64 %indvars.iv, 1\l" + + " IR 
%exitcond = icmp ne i64 %indvars.iv.next, %N\l" + "No successors\l" ] } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 0f170efac207b..0f1b52bd453e0 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -238,6 +238,8 @@ TEST(VPInstructionTest, releaseOperandsAtDeletion) { delete VPV2; } TEST(VPBasicBlockTest, getPlan) { + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); { VPBasicBlock *VPPH = new VPBasicBlock("ph"); VPBasicBlock *VPBB1 = new VPBasicBlock(); @@ -256,7 +258,9 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(VPBB3, VPBB4); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); @@ -276,7 +280,9 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(VPBB1, R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -306,7 +312,9 @@ TEST(VPBasicBlockTest, getPlan) { VPBlockUtils::connectBlocks(R2, VPBB2); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); EXPECT_EQ(&Plan, VPBB1->getPlan()); EXPECT_EQ(&Plan, R1->getPlan()); @@ -317,9 +325,12 @@ TEST(VPBasicBlockTest, getPlan) { EXPECT_EQ(&Plan, R2BB2->getPlan()); EXPECT_EQ(&Plan, VPBB2->getPlan()); } + delete ScalarHeader; } 
TEST(VPBasicBlockTest, TraversingIteratorTest) { + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); { // VPBasicBlocks only // VPBB1 @@ -347,7 +358,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // Use Plan to properly clean up created blocks. auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB4, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); } { @@ -447,7 +460,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // Use Plan to properly clean up created blocks. auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB0); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB0, ScalarHeaderVPBB); } { @@ -530,7 +545,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // Use Plan to properly clean up created blocks. auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); } { @@ -578,7 +595,9 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // Use Plan to properly clean up created blocks. auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); } { @@ -670,8 +689,11 @@ TEST(VPBasicBlockTest, TraversingIteratorTest) { // Use Plan to properly clean up created blocks. 
auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); } + delete ScalarHeader; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -708,7 +730,10 @@ TEST(VPBasicBlockTest, print) { EXPECT_EQ("EMIT br , ", I3Dump); } - VPlan Plan(VPBB0, TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPlan Plan(VPBB0, TC, VPBB1, ScalarHeaderVPBB); std::string FullDump; raw_string_ostream OS(FullDump); Plan.printDOT(OS); @@ -777,6 +802,7 @@ No successors OS << *I4; EXPECT_EQ("EMIT vp<%5> = mul vp<%3>, vp<%2>", I4Dump); } + delete ScalarHeader; } TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { @@ -790,7 +816,11 @@ TEST(VPBasicBlockTest, printPlanWithVFsAndUFs) { VPBB1->appendRecipe(I1); VPBB1->setName("bb1"); - VPlan Plan(VPBB0, TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPlan Plan(VPBB0, TC, VPBB1, ScalarHeaderVPBB); Plan.setName("TestPlan"); Plan.addVF(ElementCount::getFixed(4)); @@ -808,6 +838,9 @@ No successors bb1: EMIT vp<%2> = add +Successor(s): ir-bb<> + +ir-bb<>: No successors } )"; @@ -829,6 +862,9 @@ No successors bb1: EMIT vp<%2> = add +Successor(s): ir-bb<> + +ir-bb<>: No successors } )"; @@ -850,11 +886,15 @@ No successors bb1: EMIT vp<%2> = add +Successor(s): ir-bb<> + +ir-bb<>: No successors } )"; EXPECT_EQ(ExpectedStr, FullDump); } + delete ScalarHeader; } #endif @@ -1250,9 +1290,11 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { TEST(VPRecipeTest, dumpRecipeInPlan) { VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); VPBasicBlock *VPBB1 = new VPBasicBlock(); - VPlan Plan(VPBB0, VPBB1); - 
LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPlan Plan(VPBB0, VPBB1, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1314,14 +1356,17 @@ TEST(VPRecipeTest, dumpRecipeInPlan) { } delete AI; + delete ScalarHeader; } TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { VPBasicBlock *VPBB0 = new VPBasicBlock("preheader"); VPBasicBlock *VPBB1 = new VPBasicBlock(); - VPlan Plan(VPBB0, VPBB1); - LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB1, ScalarHeaderVPBB); + VPlan Plan(VPBB0, VPBB1, ScalarHeaderVPBB); IntegerType *Int32 = IntegerType::get(C, 32); auto *AI = BinaryOperator::CreateAdd(PoisonValue::get(Int32), @@ -1400,6 +1445,7 @@ TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesInPlan) { testing::ExitedWithCode(0), "EMIT vp<%2> = mul vp<%1>, vp<%1>"); } delete AI; + delete ScalarHeader; } TEST(VPRecipeTest, dumpRecipeUnnamedVPValuesNotInPlanOrBlock) { diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 9958d6ea124f8..edb3f8a295294 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -28,7 +28,12 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { VPBasicBlock *VPBB2 = new VPBasicBlock(); VPRegionBlock *R1 = new VPRegionBlock(VPBB2, VPBB2, "R1"); VPBlockUtils::connectBlocks(VPBB1, R1); - VPlan Plan(VPPH, &*TC, VPBB1); + + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, 
ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -38,6 +43,7 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefSameBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif + delete ScalarHeader; } TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { @@ -60,7 +66,11 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { VPBlockUtils::connectBlocks(VPBB1, R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -70,6 +80,7 @@ TEST(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) { EXPECT_STREQ("Use before def!\n", ::testing::internal::GetCapturedStderr().c_str()); #endif + delete ScalarHeader; } TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { @@ -103,7 +114,10 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { VPBB3->setParent(R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -115,6 +129,7 @@ TEST(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) { #endif delete Phi; + delete ScalarHeader; } TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { @@ -139,7 +154,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { VPBlockUtils::connectBlocks(VPBB1, R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock 
*ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -149,6 +168,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsOutsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif + delete ScalarHeader; } TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { @@ -176,7 +196,11 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { VPBB3->setParent(R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -186,6 +210,7 @@ TEST(VPVerifierTest, DuplicateSuccessorsInsideRegion) { EXPECT_STREQ("Multiple instances of the same successor.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif + delete ScalarHeader; } TEST(VPVerifierTest, BlockOutsideRegionWithParent) { @@ -205,7 +230,11 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { VPBB1->setParent(R1); auto TC = std::make_unique(); - VPlan Plan(VPPH, &*TC, VPBB1); + LLVMContext C; + auto *ScalarHeader = BasicBlock::Create(C, ""); + VPIRBasicBlock *ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(R1, ScalarHeaderVPBB); + VPlan Plan(VPPH, &*TC, VPBB1, ScalarHeaderVPBB); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); @@ -215,6 +244,7 @@ TEST(VPVerifierTest, BlockOutsideRegionWithParent) { EXPECT_STREQ("Predecessor is not in the same region.\n", ::testing::internal::GetCapturedStderr().c_str()); #endif + delete ScalarHeader; } } // namespace diff --git 
a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index e3d9d010f9ae3..ade393c11b7a2 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1001,7 +1001,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, char Char = String[i]; if (Variant.BreakCharacters.contains(Char)) { if (InTok) { - addAsmOperand(String.slice(Prev, i), false); + addAsmOperand(String.substr(Prev, i - Prev), false); Prev = i; IsIsolatedToken = false; } @@ -1010,7 +1010,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, } if (Variant.TokenizingCharacters.contains(Char)) { if (InTok) { - addAsmOperand(String.slice(Prev, i), IsIsolatedToken); + addAsmOperand(String.substr(Prev, i - Prev), IsIsolatedToken); InTok = false; IsIsolatedToken = false; } @@ -1021,7 +1021,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, } if (Variant.SeparatorCharacters.contains(Char)) { if (InTok) { - addAsmOperand(String.slice(Prev, i), IsIsolatedToken); + addAsmOperand(String.substr(Prev, i - Prev), IsIsolatedToken); InTok = false; } Prev = i + 1; @@ -1032,7 +1032,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, switch (Char) { case '\\': if (InTok) { - addAsmOperand(String.slice(Prev, i), false); + addAsmOperand(String.substr(Prev, i - Prev), false); InTok = false; IsIsolatedToken = false; } @@ -1045,7 +1045,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, case '$': { if (InTok) { - addAsmOperand(String.slice(Prev, i), IsIsolatedToken); + addAsmOperand(String.substr(Prev, i - Prev), IsIsolatedToken); InTok = false; IsIsolatedToken = false; } @@ -1059,7 +1059,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, size_t EndPos = String.find('}', i); assert(EndPos != StringRef::npos && "Missing brace in operand reference!"); - addAsmOperand(String.slice(i, EndPos + 1), IsIsolatedToken); + 
addAsmOperand(String.substr(i, EndPos + 1 - i), IsIsolatedToken); Prev = EndPos + 1; i = EndPos; IsIsolatedToken = false; diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp index 1c0ab594d9310..f72fe4c6fd562 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp @@ -566,7 +566,8 @@ std::string CodeGenInstruction::FlattenAsmStringVariants(StringRef Cur, } // Select the Nth variant (or empty). - StringRef Selection = Cur.slice(VariantsStart, VariantsEnd); + StringRef Selection = + Cur.substr(VariantsStart, VariantsEnd - VariantsStart); for (unsigned i = 0; i != Variant; ++i) Selection = Selection.split('|').second; Res += Selection.split('|').first; diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 8bebe608eece4..e74fc00015b40 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -32,20 +32,6 @@ using namespace llvm::dxil; namespace { -struct DXILArgSelect { - enum class Type { - Index, - I32, - I8, - }; - Type Type = Type::Index; - int Value = -1; -}; -struct DXILIntrinsicSelect { - StringRef Intrinsic; - SmallVector Args; -}; - struct DXILOperationDesc { std::string OpName; // name of DXIL operation int OpCode; // ID of DXIL operation @@ -56,7 +42,8 @@ struct DXILOperationDesc { SmallVector OverloadRecs; SmallVector StageRecs; SmallVector AttrRecs; - SmallVector IntrinsicSelects; + StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which + // means no map exists SmallVector ShaderStages; // shader stages to which this applies, empty for all. int OverloadParamIndex; // Index of parameter with overload type. @@ -84,21 +71,6 @@ static void ascendingSortByVersion(std::vector &Recs) { }); } -/// Take a `int_{intrinsic_name}` and return just the intrinsic_name part if -/// available. Otherwise return the empty string. 
-static StringRef GetIntrinsicName(const RecordVal *RV) { - if (RV && RV->getValue()) { - if (const DefInit *DI = dyn_cast(RV->getValue())) { - auto *IntrinsicDef = DI->getDef(); - auto DefName = IntrinsicDef->getName(); - assert(DefName.starts_with("int_") && "invalid intrinsic name"); - // Remove the int_ from intrinsic name. - return DefName.substr(4); - } - } - return ""; -} - /// Construct an object using the DXIL Operation records specified /// in DXIL.td. This serves as the single source of reference of /// the information extracted from the specified Record R, for @@ -185,63 +157,14 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { OpName); } - { - DXILIntrinsicSelect IntrSelect; - IntrSelect.Intrinsic = GetIntrinsicName(R->getValue("LLVMIntrinsic")); - if (IntrSelect.Intrinsic.size()) - IntrinsicSelects.emplace_back(std::move(IntrSelect)); - } - - auto IntrinsicSelectRecords = R->getValueAsListOfDefs("intrinsic_selects"); - if (IntrinsicSelectRecords.size()) { - if (IntrinsicSelects.size()) { - PrintFatalError( - R, Twine("LLVMIntrinsic and intrinsic_selects cannot be both " - "defined for DXIL operation - ") + - OpName); - } else { - for (const Record *R : IntrinsicSelectRecords) { - DXILIntrinsicSelect IntrSelect; - IntrSelect.Intrinsic = GetIntrinsicName(R->getValue("intrinsic")); - auto Args = R->getValueAsListOfDefs("args"); - for (const Record *Arg : Args) { - bool IsI8 = Arg->getValueAsBit("is_i8"); - bool IsI32 = Arg->getValueAsBit("is_i32"); - int Index = Arg->getValueAsInt("index"); - const Record *ValueRec = Arg->getValueAsOptionalDef("value"); - - DXILArgSelect ArgSelect; - if (IsI8) { - if (!ValueRec) { - PrintFatalError(R, Twine("'value' must be defined for i8 " - "ArgSelect for DXIL operation - ") + - OpName); - } - ArgSelect.Type = DXILArgSelect::Type::I8; - ArgSelect.Value = ValueRec->getValueAsInt("value"); - } else if (IsI32) { - if (!ValueRec) { - PrintFatalError(R, Twine("'value' must be defined for i32 " - "ArgSelect for DXIL 
operation - ") + - OpName); - } - ArgSelect.Type = DXILArgSelect::Type::I32; - ArgSelect.Value = ValueRec->getValueAsInt("value"); - } else { - if (Index < 0) { - PrintFatalError( - R, Twine("Index in ArgSelect must be equal to or " - "greater than 0 for DXIL operation - ") + - OpName); - } - ArgSelect.Type = DXILArgSelect::Type::Index; - ArgSelect.Value = Index; - } - - IntrSelect.Args.emplace_back(std::move(ArgSelect)); - } - IntrinsicSelects.emplace_back(std::move(IntrSelect)); - } + const RecordVal *RV = R->getValue("LLVMIntrinsic"); + if (RV && RV->getValue()) { + if (const DefInit *DI = dyn_cast(RV->getValue())) { + auto *IntrinsicDef = DI->getDef(); + auto DefName = IntrinsicDef->getName(); + assert(DefName.starts_with("int_") && "invalid intrinsic name"); + // Remove the int_ from intrinsic name. + Intrinsic = DefName.substr(4); } } } @@ -454,29 +377,10 @@ static void emitDXILIntrinsicMap(ArrayRef Ops, OS << "#ifdef DXIL_OP_INTRINSIC\n"; OS << "\n"; for (const auto &Op : Ops) { - if (Op.IntrinsicSelects.empty()) { + if (Op.Intrinsic.empty()) continue; - } - for (const DXILIntrinsicSelect &MappedIntr : Op.IntrinsicSelects) { - OS << "DXIL_OP_INTRINSIC(dxil::OpCode::" << Op.OpName - << ", Intrinsic::" << MappedIntr.Intrinsic; - for (const DXILArgSelect &ArgSelect : MappedIntr.Args) { - OS << ", (ArgSelect { "; - switch (ArgSelect.Type) { - case DXILArgSelect::Type::Index: - OS << "ArgSelect::Type::Index, "; - break; - case DXILArgSelect::Type::I8: - OS << "ArgSelect::Type::I8, "; - break; - case DXILArgSelect::Type::I32: - OS << "ArgSelect::Type::I32, "; - break; - } - OS << ArgSelect.Value << "})"; - } - OS << ")\n"; - } + OS << "DXIL_OP_INTRINSIC(dxil::OpCode::" << Op.OpName + << ", Intrinsic::" << Op.Intrinsic << ")\n"; } OS << "\n"; OS << "#undef DXIL_OP_INTRINSIC\n"; diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 26b881651ea41..c6cd3da13646a 100644 --- 
a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -1162,6 +1162,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("vz512mem", TYPE_MVSIBZ) TYPE("BNDR", TYPE_BNDR) TYPE("TILE", TYPE_TMM) + TYPE("TILEPair", TYPE_TMM_PAIR) errs() << "Unhandled type string " << s << "\n"; llvm_unreachable("Unhandled type string"); } @@ -1243,6 +1244,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s, ENCODING("VK64", ENCODING_RM) ENCODING("BNDR", ENCODING_RM) ENCODING("TILE", ENCODING_RM) + ENCODING("TILEPair", ENCODING_RM) errs() << "Unhandled R/M register encoding " << s << "\n"; llvm_unreachable("Unhandled R/M register encoding"); } @@ -1292,6 +1294,7 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s, ENCODING("VK64WM", ENCODING_REG) ENCODING("BNDR", ENCODING_REG) ENCODING("TILE", ENCODING_REG) + ENCODING("TILEPair", ENCODING_REG) errs() << "Unhandled reg/opcode register encoding " << s << "\n"; llvm_unreachable("Unhandled reg/opcode register encoding"); } @@ -1322,6 +1325,7 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s, ENCODING("VK32", ENCODING_VVVV) ENCODING("VK64", ENCODING_VVVV) ENCODING("TILE", ENCODING_VVVV) + ENCODING("TILEPair", ENCODING_VVVV) errs() << "Unhandled VEX.vvvv register encoding " << s << "\n"; llvm_unreachable("Unhandled VEX.vvvv register encoding"); } diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index e9d9a3be27a6d..c945c8ac42e4a 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -95,6 +95,10 @@ clang_tablegen("BuiltinsRISCV") { args = [ "-gen-clang-builtins" ] } +clang_tablegen("BuiltinsX86") { + args = [ "-gen-clang-builtins" ] +} + # ARM CDE, MVE, and NEON. 
clang_tablegen("arm_neon") { diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index 1b193af6c30af..31b4ba6304a23 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -26,6 +26,7 @@ static_library("Basic") { "//clang/include/clang/Basic:Builtins", "//clang/include/clang/Basic:BuiltinsBPF", "//clang/include/clang/Basic:BuiltinsRISCV", + "//clang/include/clang/Basic:BuiltinsX86", "//clang/include/clang/Basic:DiagnosticGroups", "//clang/include/clang/Basic:RegularKeywordAttrInfo", "//clang/include/clang/Basic:arm_cde_builtins", diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index b47189accd136..d6e7d5490ce02 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -128,6 +128,7 @@ copy("Headers") { "ammintrin.h", "amxcomplexintrin.h", "amxfp16intrin.h", + "amxfp8intrin.h", "amxintrin.h", "arm64intr.h", "arm_acle.h", diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn index fe211726eb5d5..6a4c92390d773 100644 --- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn @@ -144,7 +144,7 @@ static_library("Checkers") { "WebKit/RefCntblBaseVirtualDtorChecker.cpp", "WebKit/UncountedCallArgsChecker.cpp", "WebKit/UncountedLambdaCapturesChecker.cpp", - "WebKit/UncountedLocalVarsChecker.cpp", + "WebKit/RawPtrRefLocalVarsChecker.cpp", "cert/InvalidPtrChecker.cpp", ] } diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn index c307615cbca88..9ec69b519c30a 100644 --- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn +++ 
b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn @@ -26,6 +26,7 @@ unittest("StaticAnalysisTests") { "ParamRegionTest.cpp", "RangeSetTest.cpp", "RegisterCustomCheckersTest.cpp", + "SValSimplifyerTest.cpp", "SValTest.cpp", "StoreTest.cpp", "SymbolReaperTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn index 75c693e446a39..502aa13e1de81 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn @@ -17,6 +17,7 @@ unittest("AMDGPUTests") { include_dirs = [ "//llvm/lib/Target/AMDGPU" ] sources = [ "AMDGPUUnitTests.cpp", + "CSETest.cpp", "DwarfRegMappings.cpp", "ExecMayBeModifiedBeforeAnyUse.cpp", "PALMetadata.cpp", diff --git a/mlir/docs/ActionTracing.md b/mlir/docs/ActionTracing.md index 978fdbbe54d81..984516d5c5e7e 100644 --- a/mlir/docs/ActionTracing.md +++ b/mlir/docs/ActionTracing.md @@ -86,7 +86,7 @@ An action can also carry arbitrary payload, for example we can extend the ```c++ /// A custom Action can be defined minimally by deriving from -/// `tracing::ActionImpl`. It can has any members! +/// `tracing::ActionImpl`. It can have any members! class MyCustomAction : public tracing::ActionImpl { public: using Base = tracing::ActionImpl; diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index 0ba76199874cc..c61ceaf81681e 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -320,15 +320,41 @@ conversion target, via a set of pattern-based operation rewriting patterns. This framework also provides support for type conversions. More information on this driver can be found [here](DialectConversion.md). +### Walk Pattern Rewrite Driver + +This is a fast and simple driver that walks the given op and applies patterns +that locally have the most benefit. 
The benefit of a pattern is decided solely +by the benefit specified on the pattern, and the relative order of the pattern +within the pattern list (when two patterns have the same local benefit). + +The driver performs a post-order traversal. Note that it walks regions of the +given op but does not visit the op. + +This driver does not (re)visit modified or newly replaced ops, and does not +allow for progressive rewrites of the same op. Op and block erasure is only +supported for the currently matched op and its descendant. If your pattern +set requires these, consider using the Greedy Pattern Rewrite Driver instead, +at the expense of extra overhead. + +This driver is exposed using the `walkAndApplyPatterns` function. + +Note: This driver listens for IR changes via the callbacks provided by +`RewriterBase`. It is important that patterns announce all IR changes to the +rewriter and do not bypass the rewriter API by modifying ops directly. + +#### Debugging + +You can debug the Walk Pattern Rewrite Driver by passing the +`--debug-only=walk-rewriter` CLI flag. This will print the visited and matched +ops. + ### Greedy Pattern Rewrite Driver This driver processes ops in a worklist-driven fashion and greedily applies the -patterns that locally have the most benefit. The benefit of a pattern is decided -solely by the benefit specified on the pattern, and the relative order of the -pattern within the pattern list (when two patterns have the same local benefit). -Patterns are iteratively applied to operations until a fixed point is reached or -until the configurable maximum number of iterations exhausted, at which point -the driver finishes. +patterns that locally have the most benefit (same as the Walk Pattern Rewrite +Driver). Patterns are iteratively applied to operations until a fixed point is +reached or until the configurable maximum number of iterations exhausted, at +which point the driver finishes. 
This driver comes in two fashions: @@ -368,7 +394,7 @@ rewriter and do not bypass the rewriter API by modifying ops directly. Note: This driver is the one used by the [canonicalization](Canonicalization.md) [pass](Passes.md/#-canonicalize) in MLIR. -### Debugging +#### Debugging To debug the execution of the greedy pattern rewrite driver, `-debug-only=greedy-rewriter` may be used. This command line flag activates diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index cba35bbca1f83..69745addfd748 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -254,7 +254,7 @@ def AMDGPU_RawBufferAtomicCmpswapOp : def AMDGPU_RawBufferAtomicFaddOp : AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>, AttrSizedOperandSegments]>, - Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16]>]>:$value, + Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16, BF16]>]>:$value, Arg:$memref, Variadic:$indices, DefaultValuedAttr:$boundsCheck, diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td index 1c70a4b8df925..7bcc3b9e79986 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td @@ -1,4 +1,4 @@ -//===- BufferizationOps.td - Bufferization op definitions ----------*- tablegen -*-===// +//===- BufferizationOps.td - Bufferization op definitions --*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -602,7 +602,7 @@ def Bufferization_DeallocOp : Bufferization_Op<"dealloc", [ ``` Deallocation will be called on `%a0` if `%cond0` is 'true' and neither `%r0`, `%r1`, or `%r2` are aliases of `%a0`. 
`%a1` will be deallocated when - `%cond1` is set to 'true' and none of `%r0`, %r1`, `%r2`, and `%a0` are + `%cond1` is set to 'true' and none of `%r0`, `%r1`, `%r2`, and `%a0` are aliases. Note that this can be an expensive operation if there are many operands that diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index cc5463ea968fc..2743de43fb9cf 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -584,7 +584,7 @@ def EmptyTensorElimination : Pass<"eliminate-empty-tensors"> { ``` %0 = tensor.empty() : tensor<10xf32> %1 = linalg.fill ... outs(%0 : tensor<10xf32>) - %2 = tensor.insert_slice %0 into %t ... + %2 = tensor.insert_slice %1 into %t ... ``` In the above example, the subset op is "tensor.insert_slice". When tracing diff --git a/mlir/include/mlir/Dialect/DLTI/DLTIAttrs.td b/mlir/include/mlir/Dialect/DLTI/DLTIAttrs.td index 53d38407608be..d54b3191eed7e 100644 --- a/mlir/include/mlir/Dialect/DLTI/DLTIAttrs.td +++ b/mlir/include/mlir/Dialect/DLTI/DLTIAttrs.td @@ -88,11 +88,15 @@ def DLTI_DataLayoutSpecAttr : /// Returns the attribute associated with the key. 
FailureOr query(DataLayoutEntryKey key) { - return llvm::cast(*this).queryHelper(key); + return ::llvm::cast(*this).queryHelper(key); } }]; } +//===----------------------------------------------------------------------===// +// MapAttr +//===----------------------------------------------------------------------===// + def DLTI_MapAttr : DLTIAttr<"Map", [DLTIQueryInterface]> { let summary = "A mapping of DLTI-information by way of key-value pairs"; let description = [{ @@ -106,18 +110,16 @@ def DLTI_MapAttr : DLTIAttr<"Map", [DLTIQueryInterface]> { Consider the following flat encoding of a single-key dictionary ``` - #dlti.map<#dlti.dl_entry<"CPU::cache::L1::size_in_bytes", 65536 : i32>> + #dlti.map<"CPU::cache::L1::size_in_bytes" = 65536 : i32>> ``` versus nested maps, which make it possible to obtain sub-dictionaries of related information (with the following example making use of other attributes that also implement the `DLTIQueryInterface`): ``` - #dlti.target_system_spec<"CPU": - #dlti.target_device_spec<#dlti.dl_entry<"cache", - #dlti.map<#dlti.dl_entry<"L1", - #dlti.map<#dlti.dl_entry<"size_in_bytes", 65536 : i32>>>, - #dlti.dl_entry<"L1d", - #dlti.map<#dlti.dl_entry<"size_in_bytes", 32768 : i32>>> >>>> + #dlti.target_system_spec<"CPU" = + #dlti.target_device_spec<"cache" = + #dlti.map<"L1" = #dlti.map<"size_in_bytes" = 65536 : i32>, + "L1d" = #dlti.map<"size_in_bytes" = 32768 : i32> >>> ``` With the flat encoding, the implied structure of the key is ignored, that is @@ -132,14 +134,13 @@ def DLTI_MapAttr : DLTIAttr<"Map", [DLTIQueryInterface]> { `transform.dlti.query ["CPU","cache","L1","size_in_bytes"] at %op` gives back the first leaf value contained. To access the other leaf, we need to do `transform.dlti.query ["CPU","cache","L1d","size_in_bytes"] at %op`. 
- ``` }]; let parameters = (ins ArrayRefParameter<"DataLayoutEntryInterface", "">:$entries ); let mnemonic = "map"; let genVerifyDecl = 1; - let assemblyFormat = "`<` $entries `>`"; + let hasCustomAssemblyFormat = 1; let extraClassDeclaration = [{ /// Returns the attribute associated with the key. FailureOr query(DataLayoutEntryKey key) { @@ -167,20 +168,23 @@ def DLTI_TargetSystemSpecAttr : ``` dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"dlti.L1_cache_size_in_bytes", 4096: ui32>>, - "GPU": #dlti.target_device_spec< - #dlti.dl_entry<"dlti.max_vector_op_width", 64 : ui32>>, - "XPU": #dlti.target_device_spec< - #dlti.dl_entry<"dlti.max_vector_op_width", 4096 : ui32>>> + "CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 4096: ui32>, + "GPU" = #dlti.target_device_spec< + "max_vector_op_width" = 64 : ui32>, + "XPU" = #dlti.target_device_spec< + "max_vector_op_width" = 4096 : ui32>> ``` + + The verifier checks that keys are strings and pointed to values implement + DLTI's TargetDeviceSpecInterface. }]; let parameters = (ins - ArrayRefParameter<"DeviceIDTargetDeviceSpecPair", "">:$entries + ArrayRefParameter<"DataLayoutEntryInterface">:$entries ); let mnemonic = "target_system_spec"; let genVerifyDecl = 1; - let assemblyFormat = "`<` $entries `>`"; + let hasCustomAssemblyFormat = 1; let extraClassDeclaration = [{ /// Return the device specification that matches the given device ID std::optional @@ -189,7 +193,7 @@ def DLTI_TargetSystemSpecAttr : /// Returns the attribute associated with the key. 
FailureOr query(DataLayoutEntryKey key) const { - return llvm::cast(*this).queryHelper(key); + return ::llvm::cast(*this).queryHelper(key); } }]; let extraClassDefinition = [{ @@ -197,8 +201,10 @@ def DLTI_TargetSystemSpecAttr : $cppClass::getDeviceSpecForDeviceID( TargetSystemSpecInterface::DeviceID deviceID) { for (const auto& entry : getEntries()) { - if (entry.first == deviceID) - return entry.second; + if (entry.getKey() == DataLayoutEntryKey(deviceID)) + if (auto deviceSpec = + ::llvm::dyn_cast(entry.getValue())) + return deviceSpec; } return std::nullopt; } @@ -219,21 +225,20 @@ def DLTI_TargetDeviceSpecAttr : Example: ``` - #dlti.target_device_spec< - #dlti.dl_entry<"dlti.max_vector_op_width", 64 : ui32>> + #dlti.target_device_spec<"max_vector_op_width" = 64 : ui32> ``` }]; let parameters = (ins - ArrayRefParameter<"DataLayoutEntryInterface", "">:$entries + ArrayRefParameter<"DataLayoutEntryInterface">:$entries ); let mnemonic = "target_device_spec"; let genVerifyDecl = 1; - let assemblyFormat = "`<` $entries `>`"; + let hasCustomAssemblyFormat = 1; let extraClassDeclaration = [{ /// Returns the attribute associated with the key. FailureOr query(DataLayoutEntryKey key) const { - return llvm::cast(*this).queryHelper(key); + return ::llvm::cast(*this).queryHelper(key); } }]; } diff --git a/mlir/include/mlir/Dialect/Func/Transforms/DecomposeCallGraphTypes.h b/mlir/include/mlir/Dialect/Func/Transforms/DecomposeCallGraphTypes.h index 1d311b37b37a4..1be406bf3adf9 100644 --- a/mlir/include/mlir/Dialect/Func/Transforms/DecomposeCallGraphTypes.h +++ b/mlir/include/mlir/Dialect/Func/Transforms/DecomposeCallGraphTypes.h @@ -23,70 +23,10 @@ namespace mlir { -/// This class provides a hook that expands one Value into multiple Value's, -/// with a TypeConverter-inspired callback registration mechanism. 
-/// -/// For folks that are familiar with the dialect conversion framework / -/// TypeConverter, this is effectively the inverse of a source/argument -/// materialization. A target materialization is not what we want here because -/// it always produces a single Value, but in this case the whole point is to -/// decompose a Value into multiple Value's. -/// -/// The reason we need this inverse is easily understood by looking at what we -/// need to do for decomposing types for a return op. When converting a return -/// op, the dialect conversion framework will give the list of converted -/// operands, and will ensure that each converted operand, even if it expanded -/// into multiple types, is materialized as a single result. We then need to -/// undo that materialization to a single result, which we do with the -/// decomposeValue hooks registered on this object. -/// -/// TODO: Eventually, the type conversion infra should have this hook built-in. -/// See -/// https://llvm.discourse.group/t/extending-type-conversion-infrastructure/779/2 -class ValueDecomposer { -public: - /// This method tries to decompose a value of a certain type using provided - /// decompose callback functions. If it is unable to do so, the original value - /// is returned. - void decomposeValue(OpBuilder &, Location, Type, Value, - SmallVectorImpl &); - - /// This method registers a callback function that will be called to decompose - /// a value of a certain type into 0, 1, or multiple values. - template >::template arg_t<2>> - void addDecomposeValueConversion(FnT &&callback) { - decomposeValueConversions.emplace_back( - wrapDecomposeValueConversionCallback(std::forward(callback))); - } - -private: - using DecomposeValueConversionCallFn = - std::function( - OpBuilder &, Location, Type, Value, SmallVectorImpl &)>; - - /// Generate a wrapper for the given decompose value conversion callback. 
- template - DecomposeValueConversionCallFn - wrapDecomposeValueConversionCallback(FnT &&callback) { - return - [callback = std::forward(callback)]( - OpBuilder &builder, Location loc, Type type, Value value, - SmallVectorImpl &newValues) -> std::optional { - if (T derivedType = dyn_cast(type)) - return callback(builder, loc, derivedType, value, newValues); - return std::nullopt; - }; - } - - SmallVector decomposeValueConversions; -}; - /// Populates the patterns needed to drive the conversion process for -/// decomposing call graph types with the given `ValueDecomposer`. +/// decomposing call graph types with the given `TypeConverter`. void populateDecomposeCallGraphTypesPatterns(MLIRContext *context, const TypeConverter &typeConverter, - ValueDecomposer &decomposer, RewritePatternSet &patterns); } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 855fa5702f5b9..e8eeafd09a9cb 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -721,6 +721,22 @@ def LLVM_DICommonBlockAttr : LLVM_Attr<"DICommonBlock", "di_common_block", let assemblyFormat = "`<` struct(params) `>`"; } +//===----------------------------------------------------------------------===// +// DIGenericSubrangeAttr +//===----------------------------------------------------------------------===// + +def LLVM_DIGenericSubrangeAttr : LLVM_Attr<"DIGenericSubrange", + "di_generic_subrange", /*traits=*/[], + "DINodeAttr"> { + let parameters = (ins + OptionalParameter<"::mlir::Attribute">:$count, + "::mlir::Attribute":$lowerBound, + OptionalParameter<"::mlir::Attribute">:$upperBound, + "::mlir::Attribute":$stride + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + //===----------------------------------------------------------------------===// // DISubroutineTypeAttr //===----------------------------------------------------------------------===// diff 
--git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 96247e9b081d2..f40405773ee87 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -461,54 +461,60 @@ class RewriterBase : public OpBuilder { /// struct can be used as a base to create listener chains, so that multiple /// listeners can be notified of IR changes. struct ForwardingListener : public RewriterBase::Listener { - ForwardingListener(OpBuilder::Listener *listener) : listener(listener) {} + ForwardingListener(OpBuilder::Listener *listener) + : listener(listener), + rewriteListener( + dyn_cast_if_present(listener)) {} void notifyOperationInserted(Operation *op, InsertPoint previous) override { - listener->notifyOperationInserted(op, previous); + if (listener) + listener->notifyOperationInserted(op, previous); } void notifyBlockInserted(Block *block, Region *previous, Region::iterator previousIt) override { - listener->notifyBlockInserted(block, previous, previousIt); + if (listener) + listener->notifyBlockInserted(block, previous, previousIt); } void notifyBlockErased(Block *block) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyBlockErased(block); } void notifyOperationModified(Operation *op) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyOperationModified(op); } void notifyOperationReplaced(Operation *op, Operation *newOp) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyOperationReplaced(op, newOp); } void notifyOperationReplaced(Operation *op, ValueRange replacement) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyOperationReplaced(op, replacement); } void notifyOperationErased(Operation *op) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) 
rewriteListener->notifyOperationErased(op); } void notifyPatternBegin(const Pattern &pattern, Operation *op) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyPatternBegin(pattern, op); } void notifyPatternEnd(const Pattern &pattern, LogicalResult status) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyPatternEnd(pattern, status); } void notifyMatchFailure( Location loc, function_ref reasonCallback) override { - if (auto *rewriteListener = dyn_cast(listener)) + if (rewriteListener) rewriteListener->notifyMatchFailure(loc, reasonCallback); } private: OpBuilder::Listener *listener; + RewriterBase::Listener *rewriteListener; }; /// Move the blocks that belong to "region" before the given position in diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h index 848d2dee4a630..7a7b659724f86 100644 --- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h +++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h @@ -15,6 +15,7 @@ #ifndef MLIR_INTERFACES_DATALAYOUTINTERFACES_H #define MLIR_INTERFACES_DATALAYOUTINTERFACES_H +#include "mlir/IR/Attributes.h" #include "mlir/IR/DialectInterface.h" #include "mlir/IR/OpDefinition.h" #include "llvm/ADT/DenseMap.h" @@ -32,10 +33,7 @@ using DataLayoutEntryKey = llvm::PointerUnion; using DataLayoutEntryList = llvm::SmallVector; using DataLayoutEntryListRef = llvm::ArrayRef; using TargetDeviceSpecListRef = llvm::ArrayRef; -using DeviceIDTargetDeviceSpecPair = - std::pair; -using DeviceIDTargetDeviceSpecPairListRef = - llvm::ArrayRef; +using TargetDeviceSpecEntry = std::pair; class DataLayoutOpInterface; class DataLayoutSpecInterface; class ModuleOp; diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td index d6e955be4291a..3532116700af5 100644 --- 
a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td +++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td @@ -276,7 +276,7 @@ def TargetDeviceSpecInterface : AttrInterface<"TargetDeviceSpecInterface", [DLTI /// Helper for default implementation of `DLTIQueryInterface`'s `query`. ::mlir::FailureOr<::mlir::Attribute> queryHelper(::mlir::DataLayoutEntryKey key) const { - if (auto strKey = llvm::dyn_cast(key)) + if (auto strKey = ::llvm::dyn_cast(key)) if (DataLayoutEntryInterface spec = getSpecForIdentifier(strKey)) return spec.getValue(); return ::mlir::failure(); @@ -304,7 +304,7 @@ def TargetSystemSpecInterface : AttrInterface<"TargetSystemSpecInterface", [DLTI let methods = [ InterfaceMethod< /*description=*/"Returns the list of layout entries.", - /*retTy=*/"llvm::ArrayRef", + /*retTy=*/"::llvm::ArrayRef", /*methodName=*/"getEntries", /*args=*/(ins) >, @@ -334,7 +334,7 @@ def TargetSystemSpecInterface : AttrInterface<"TargetSystemSpecInterface", [DLTI /// Helper for default implementation of `DLTIQueryInterface`'s `query`. ::mlir::FailureOr<::mlir::Attribute> queryHelper(::mlir::DataLayoutEntryKey key) const { - if (auto strKey = llvm::dyn_cast<::mlir::StringAttr>(key)) + if (auto strKey = ::llvm::dyn_cast<::mlir::StringAttr>(key)) if (auto deviceSpec = getDeviceSpecForDeviceID(strKey)) return *deviceSpec; return ::mlir::failure(); diff --git a/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h new file mode 100644 index 0000000000000..6d62ae3dd43dc --- /dev/null +++ b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h @@ -0,0 +1,37 @@ +//===- WALKPATTERNREWRITEDRIVER.h - Walk Pattern Rewrite Driver -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declares a helper function to walk the given op and apply rewrite patterns. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TRANSFORMS_WALKPATTERNREWRITEDRIVER_H_ +#define MLIR_TRANSFORMS_WALKPATTERNREWRITEDRIVER_H_ + +#include "mlir/IR/Visitors.h" +#include "mlir/Rewrite/FrozenRewritePatternSet.h" + +namespace mlir { + +/// A fast walk-based pattern rewrite driver. Rewrites ops nested under the +/// given operation by walking it and applying the highest benefit patterns. +/// This rewriter *does not* wait until a fixpoint is reached and *does not* +/// visit modified or newly replaced ops. Also *does not* perform folding or +/// dead-code elimination. +/// +/// This is intended as the simplest and most lightweight pattern rewriter in +/// cases when a simple walk gets the job done. +/// +/// Note: Does not apply patterns to the given operation itself. +void walkAndApplyPatterns(Operation *op, + const FrozenRewritePatternSet &patterns, + RewriterBase::Listener *listener = nullptr); + +} // namespace mlir + +#endif // MLIR_TRANSFORMS_WALKPATTERNREWRITEDRIVER_H_ diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index e9de3062d41c7..1204f1c069b1e 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -2510,7 +2510,7 @@ LogicalResult BytecodeReader::Impl::defineValues(EncodingReader &reader, } Value BytecodeReader::Impl::createForwardRef() { - // Check for an avaliable existing operation to use. Otherwise, create a new + // Check for an available existing operation to use. Otherwise, create a new // fake operation to use for the reference. 
if (!openForwardRefOps.empty()) { Operation *op = &openForwardRefOps.back(); diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 0ccd4133d3761..5a7897f233eaa 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -98,15 +98,8 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32 // and the total load size is >= 32, use a vector load of N / (bitsize(T) / // 32) x i32 and bitcast. Also, the CAS intrinsic requires integer operands, - // so bitcast any floats to integers. On top of all this, cast bfloat - // (vectors) to i16 since the backend doesn't currently support bfloat on - // these operations. + // so bitcast any floats to integers. Type llvmBufferValType = llvmWantedDataType; - if (wantedDataType.isBF16()) - llvmBufferValType = rewriter.getI16Type(); - if (auto wantedVecType = dyn_cast(wantedDataType)) - if (wantedVecType.getElementType().isBF16()) - llvmBufferValType = wantedVecType.clone(rewriter.getI16Type()); if (atomicCmpData) { if (auto floatType = dyn_cast(wantedDataType)) llvmBufferValType = this->getTypeConverter()->convertType( diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp index 269bf1225bed9..985fcd351535c 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp @@ -42,6 +42,19 @@ struct ConvertMemRefToEmitCPass populateMemRefToEmitCTypeConversion(converter); populateEmitCSizeTTypeConversions(converter); + auto materializeAsUnrealizedCast = [](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> Value { + if (inputs.size() != 1) + return Value(); + + return builder.create(loc, resultType, inputs) + .getResult(0); + }; + + 
converter.addSourceMaterialization(materializeAsUnrealizedCast); + converter.addTargetMaterialization(materializeAsUnrealizedCast); + RewritePatternSet patterns(&getContext()); populateMemRefToEmitCConversionPatterns(patterns, converter); diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index ee2b86f8f3505..5cdd0b2a02af5 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -1274,6 +1274,9 @@ class RescaleConverter : public OpConversionPattern { return rewriter.notifyMatchFailure( op, "tosa.rescale requires scale32 for double_round to be true"); + if (!isa(inputTy.getElementType())) + return rewriter.notifyMatchFailure(op, "only support integer type"); + SmallVector dynDims; for (int i = 0; i < outputTy.getRank(); i++) { if (outputTy.isDynamicDim(i)) { diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp index c8cfcc3d945be..46c7bfbf3ffcc 100644 --- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp @@ -37,40 +37,38 @@ std::pair getTileSizes(ConversionPatternRewriter &rewriter, rewriter.create(loc, llvmInt16Type, nattr)); } -/// Verifies if the stride matches proper tile access. -LogicalResult verifyStride(MemRefType mType) { - if (mType.getRank() < 2) - return failure(); - int64_t last = mType.getRank() - 1; - int64_t offset; - SmallVector strides; - if (failed(getStridesAndOffset(mType, strides, offset)) || strides[last] != 1) - return failure(); - return success(); -} - /// Maps the 2-dim memref shape to the 64-bit stride. Note that the buffer /// shape may "envelop" the actual tile shape, and may be dynamically sized. 
-Value getStride(ConversionPatternRewriter &rewriter, - const LLVMTypeConverter &typeConverter, MemRefType mType, - Value base, Location loc) { - assert(mType.getRank() >= 2); - int64_t last = mType.getRank() - 1; +/// Returns failure if proper stride couldn't be found. +FailureOr getStride(ConversionPatternRewriter &rewriter, + const LLVMTypeConverter &typeConverter, + MemRefType mType, Value base, Location loc) { + if (mType.getRank() < 2) + return failure(); + int64_t preLast = mType.getRank() - 2; Type llvmInt64Type = IntegerType::get(&typeConverter.getContext(), 64); unsigned width = mType.getElementType().getIntOrFloatBitWidth(); assert(llvm::isPowerOf2_64(width) && width >= 8); unsigned bytes = width >> 3; - if (mType.isDynamicDim(last)) { - // Dynamic size needs code to compute the stride at runtime. + int64_t offset; + SmallVector strides; + if (failed(getStridesAndOffset(mType, strides, offset)) || + strides.back() != 1) + return failure(); + if (strides[preLast] == ShapedType::kDynamic) { + // Dynamic stride needs code to compute the stride at runtime. MemRefDescriptor memrefDescriptor(base); auto attr = rewriter.getI64IntegerAttr(bytes); Value scale = rewriter.create(loc, llvmInt64Type, attr); - return rewriter.create( - loc, llvmInt64Type, scale, memrefDescriptor.size(rewriter, loc, last)); + return rewriter + .create(loc, llvmInt64Type, scale, + memrefDescriptor.stride(rewriter, loc, preLast)) + .getResult(); } - // Use direct constant for static size. - auto attr = rewriter.getI64IntegerAttr(mType.getDimSize(last) * bytes); - return rewriter.create(loc, llvmInt64Type, attr); + // Use direct constant for static stride. 
+ auto attr = rewriter.getI64IntegerAttr(strides[preLast] * bytes); + return rewriter.create(loc, llvmInt64Type, attr) + .getResult(); } struct TileZeroConversion : public ConvertOpToLLVMPattern { @@ -102,16 +100,16 @@ struct TileLoadConversion : public ConvertOpToLLVMPattern { std::pair tsz = getTileSizes(rewriter, *getTypeConverter(), vType, op.getLoc()); // Determine stride. - if (failed(verifyStride(mType))) + auto stride = getStride(rewriter, *getTypeConverter(), mType, + adaptor.getBase(), op.getLoc()); + if (failed(stride)) return failure(); - Value stride = getStride(rewriter, *getTypeConverter(), mType, - adaptor.getBase(), op.getLoc()); // Replace operation with intrinsic. Value ptr = getStridedElementPtr(op.getLoc(), mType, adaptor.getBase(), adaptor.getIndices(), rewriter); Type resType = typeConverter->convertType(vType); rewriter.replaceOpWithNewOp( - op, resType, tsz.first, tsz.second, ptr, stride); + op, resType, tsz.first, tsz.second, ptr, stride.value()); return success(); } }; @@ -128,15 +126,15 @@ struct TileStoreConversion : public ConvertOpToLLVMPattern { std::pair tsz = getTileSizes(rewriter, *getTypeConverter(), vType, op.getLoc()); // Determine stride. - if (failed(verifyStride(mType))) + auto stride = getStride(rewriter, *getTypeConverter(), mType, + adaptor.getBase(), op.getLoc()); + if (failed(stride)) return failure(); - Value stride = getStride(rewriter, *getTypeConverter(), mType, - adaptor.getBase(), op.getLoc()); // Replace operation with intrinsic. 
Value ptr = getStridedElementPtr(op.getLoc(), mType, adaptor.getBase(), adaptor.getIndices(), rewriter); rewriter.replaceOpWithNewOp( - op, tsz.first, tsz.second, ptr, stride, adaptor.getVal()); + op, tsz.first, tsz.second, ptr, stride.value(), adaptor.getVal()); return success(); } }; diff --git a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp index bebe0b5a7c0b6..8922e93e399f9 100644 --- a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp @@ -14,7 +14,7 @@ #include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/PatternMatch.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/WalkPatternRewriteDriver.h" namespace mlir { namespace arith { @@ -157,11 +157,7 @@ struct ArithUnsignedWhenEquivalentPass RewritePatternSet patterns(ctx); populateUnsignedWhenEquivalentPatterns(patterns, solver); - GreedyRewriteConfig config; - config.listener = &listener; - - if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) - signalPassFailure(); + walkAndApplyPatterns(op, std::move(patterns), &listener); } }; } // end anonymous namespace diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index 85ec9fc93248a..508e50d42e4cf 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -8,6 +8,7 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinDialect.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" @@ -28,6 +29,134 @@ using namespace mlir; #define DEBUG_TYPE "dlti" +//===----------------------------------------------------------------------===// +// Common parsing utility functions. 
+//===----------------------------------------------------------------------===// + +/// Parse an entry which can either be of the form `key = value` or a +/// #dlti.dl_entry attribute. When `tryType=true` the key can be a type, +/// otherwise only quoted strings are allowed. The grammar is as follows: +/// entry ::= ((type | quoted-string) `=` attr) | dl-entry-attr +static ParseResult parseKeyValuePair(AsmParser &parser, + DataLayoutEntryInterface &entry, + bool tryType = false) { + Attribute value; + + if (tryType) { + Type type; + OptionalParseResult parsedType = parser.parseOptionalType(type); + if (parsedType.has_value()) { + if (failed(parsedType.value())) + return parser.emitError(parser.getCurrentLocation()) + << "error while parsing type DLTI key"; + + if (failed(parser.parseEqual()) || failed(parser.parseAttribute(value))) + return failure(); + + entry = DataLayoutEntryAttr::get(type, value); + return ParseResult::success(); + } + } + + std::string ident; + OptionalParseResult parsedStr = parser.parseOptionalString(&ident); + if (parsedStr.has_value() && succeeded(parsedStr.value())) { + if (failed(parser.parseEqual()) || failed(parser.parseAttribute(value))) + return failure(); // Assume that an error has already been emitted. + + entry = DataLayoutEntryAttr::get( + StringAttr::get(parser.getContext(), ident), value); + return ParseResult::success(); + } + + OptionalParseResult parsedEntry = parser.parseAttribute(entry); + if (parsedEntry.has_value()) { + if (succeeded(parsedEntry.value())) + return parsedEntry.value(); + return failure(); // Assume that an error has already been emitted. + } + return parser.emitError(parser.getCurrentLocation()) + << "failed to parse DLTI entry"; +} + +/// Construct a requested attribute by parsing list of entries occurring within +/// a pair of `<` and `>`, optionally allow types as keys and an empty list. 
+/// The grammar is as follows: +/// bracketed-entry-list ::=`<` entry-list `>` +/// entry-list ::= | entry | entry `,` entry-list +/// entry ::= ((type | quoted-string) `=` attr) | dl-entry-attr +template +static Attribute parseAngleBracketedEntries(AsmParser &parser, Type ty, + bool tryType = false, + bool allowEmpty = false) { + SmallVector entries; + if (failed(parser.parseCommaSeparatedList( + AsmParser::Delimiter::LessGreater, [&]() { + return parseKeyValuePair(parser, entries.emplace_back(), tryType); + }))) + return {}; + + if (entries.empty() && !allowEmpty) { + parser.emitError(parser.getNameLoc()) << "no DLTI entries provided"; + return {}; + } + + return Attr::getChecked([&] { return parser.emitError(parser.getNameLoc()); }, + parser.getContext(), ArrayRef(entries)); +} + +//===----------------------------------------------------------------------===// +// Common printing utility functions. +//===----------------------------------------------------------------------===// + +/// Convert pointer-union keys to strings. +static std::string keyToStr(DataLayoutEntryKey key) { + std::string buf; + TypeSwitch(key) + .Case( // The only two kinds of key we know of. + [&](auto key) { llvm::raw_string_ostream(buf) << key; }); + return buf; +} + +/// Pretty-print entries, each in `key = value` format, separated by commas. +template +static void printAngleBracketedEntries(AsmPrinter &os, T &&entries) { + os << "<"; + llvm::interleaveComma(std::forward(entries), os, [&](auto entry) { + os << keyToStr(entry.getKey()) << " = " << entry.getValue(); + }); + os << ">"; +} + +//===----------------------------------------------------------------------===// +// Common verifying utility functions. +//===----------------------------------------------------------------------===// + +/// Verify entries, with the option to disallow types as keys. 
+static LogicalResult verifyEntries(function_ref emitError, + ArrayRef entries, + bool allowTypes = true) { + DenseSet keys; + for (DataLayoutEntryInterface entry : entries) { + if (!entry) + return emitError() << "contained invalid DLTI entry"; + DataLayoutEntryKey key = entry.getKey(); + if (key.isNull()) + return emitError() << "contained invalid DLTI key"; + if (!allowTypes && dyn_cast(key)) + return emitError() << "type as DLTI key is not allowed"; + if (auto strKey = dyn_cast(key)) + if (strKey.getValue().empty()) + return emitError() << "empty string as DLTI key is not allowed"; + if (!keys.insert(key).second) + return emitError() << "repeated DLTI key: " << keyToStr(key); + if (!entry.getValue()) + return emitError() << "value associated to DLTI key " << keyToStr(key) + << " is invalid"; + } + return success(); +} + //===----------------------------------------------------------------------===// // DataLayoutEntryAttr //===----------------------------------------------------------------------===// @@ -71,15 +200,16 @@ DataLayoutEntryKey DataLayoutEntryAttr::getKey() const { Attribute DataLayoutEntryAttr::getValue() const { return getImpl()->value; } /// Parses an attribute with syntax: -/// attr ::= `#target.` `dl_entry` `<` (type | quoted-string) `,` attr `>` -Attribute DataLayoutEntryAttr::parse(AsmParser &parser, Type ty) { +/// dl-entry-attr ::= `#dlti.` `dl_entry` `<` (type | quoted-string) `,` +/// attr `>` +Attribute DataLayoutEntryAttr::parse(AsmParser &parser, Type type) { if (failed(parser.parseLess())) return {}; - Type type = nullptr; + Type typeKey = nullptr; std::string identifier; SMLoc idLoc = parser.getCurrentLocation(); - OptionalParseResult parsedType = parser.parseOptionalType(type); + OptionalParseResult parsedType = parser.parseOptionalType(typeKey); if (parsedType.has_value() && failed(parsedType.value())) return {}; if (!parsedType.has_value()) { @@ -95,38 +225,29 @@ Attribute DataLayoutEntryAttr::parse(AsmParser &parser, Type ty) { 
failed(parser.parseGreater())) return {}; - return type ? get(type, value) - : get(parser.getBuilder().getStringAttr(identifier), value); + return typeKey ? get(typeKey, value) + : get(parser.getBuilder().getStringAttr(identifier), value); } -void DataLayoutEntryAttr::print(AsmPrinter &os) const { - os << "<"; - if (auto type = llvm::dyn_cast_if_present(getKey())) - os << type; - else - os << "\"" << getKey().get().strref() << "\""; - os << ", " << getValue() << ">"; +void DataLayoutEntryAttr::print(AsmPrinter &printer) const { + printer << "<" << keyToStr(getKey()) << ", " << getValue() << ">"; } //===----------------------------------------------------------------------===// // DLTIMapAttr //===----------------------------------------------------------------------===// -static LogicalResult verifyEntries(function_ref emitError, - ArrayRef entries) { - DenseSet types; - DenseSet ids; - for (DataLayoutEntryInterface entry : entries) { - if (auto type = llvm::dyn_cast_if_present(entry.getKey())) { - if (!types.insert(type).second) - return emitError() << "repeated layout entry key: " << type; - } else { - auto id = entry.getKey().get(); - if (!ids.insert(id).second) - return emitError() << "repeated layout entry key: " << id.getValue(); - } - } - return success(); +/// Parses an attribute with syntax: +/// map-attr ::= `#dlti.` `map` `<` entry-list `>` +/// entry-list ::= entry | entry `,` entry-list +/// entry ::= ((type | quoted-string) `=` attr) | dl-entry-attr +Attribute MapAttr::parse(AsmParser &parser, Type type) { + return parseAngleBracketedEntries(parser, type, /*tryType=*/true, + /*allowEmpty=*/true); +} + +void MapAttr::print(AsmPrinter &printer) const { + printAngleBracketedEntries(printer, getEntries()); } LogicalResult MapAttr::verify(function_ref emitError, @@ -196,7 +317,7 @@ combineOneSpec(DataLayoutSpecInterface spec, typeSample.getContext()->getLoadedDialect() && "unexpected data layout entry for built-in type"); - auto interface = 
llvm::cast(typeSample); + auto interface = cast(typeSample); if (!interface.areCompatible(entriesForType.lookup(kvp.first), kvp.second)) return failure(); @@ -230,7 +351,7 @@ DataLayoutSpecAttr DataLayoutSpecAttr::combineWith(ArrayRef specs) const { // Only combine with attributes of the same kind. // TODO: reconsider this when the need arises. - if (llvm::any_of(specs, [](DataLayoutSpecInterface spec) { + if (any_of(specs, [](DataLayoutSpecInterface spec) { return !llvm::isa(spec); })) return {}; @@ -282,98 +403,40 @@ DataLayoutSpecAttr::getStackAlignmentIdentifier(MLIRContext *context) const { DLTIDialect::kDataLayoutStackAlignmentKey); } -/// Parses an attribute with syntax -/// attr ::= `#target.` `dl_spec` `<` attr-list? `>` -/// attr-list ::= attr -/// | attr `,` attr-list +/// Parses an attribute with syntax: +/// dl-spec-attr ::= `#dlti.` `dl_spec` `<` entry-list `>` +/// entry-list ::= | entry | entry `,` entry-list +/// entry ::= ((type | quoted-string) = attr) | dl-entry-attr Attribute DataLayoutSpecAttr::parse(AsmParser &parser, Type type) { - if (failed(parser.parseLess())) - return {}; - - // Empty spec. 
- if (succeeded(parser.parseOptionalGreater())) - return get(parser.getContext(), {}); - - SmallVector entries; - if (parser.parseCommaSeparatedList( - [&]() { return parser.parseAttribute(entries.emplace_back()); }) || - parser.parseGreater()) - return {}; - - return getChecked([&] { return parser.emitError(parser.getNameLoc()); }, - parser.getContext(), entries); + return parseAngleBracketedEntries(parser, type, + /*tryType=*/true, + /*allowEmpty=*/true); } -void DataLayoutSpecAttr::print(AsmPrinter &os) const { - os << "<"; - llvm::interleaveComma(getEntries(), os); - os << ">"; +void DataLayoutSpecAttr::print(AsmPrinter &printer) const { + printAngleBracketedEntries(printer, getEntries()); } //===----------------------------------------------------------------------===// // TargetDeviceSpecAttr //===----------------------------------------------------------------------===// -namespace mlir { -/// A FieldParser for key-value pairs of DeviceID-target device spec pairs that -/// make up a target system spec. 
-template <> -struct FieldParser { - static FailureOr parse(AsmParser &parser) { - std::string deviceID; - - if (failed(parser.parseString(&deviceID))) { - parser.emitError(parser.getCurrentLocation()) - << "DeviceID is missing, or is not of string type"; - return failure(); - } - - if (failed(parser.parseColon())) { - parser.emitError(parser.getCurrentLocation()) << "Missing colon"; - return failure(); - } - - auto target_device_spec = - FieldParser::parse(parser); - if (failed(target_device_spec)) { - parser.emitError(parser.getCurrentLocation()) - << "Error in parsing target device spec"; - return failure(); - } - - return std::make_pair(parser.getBuilder().getStringAttr(deviceID), - *target_device_spec); - } -}; - -inline AsmPrinter &operator<<(AsmPrinter &printer, - DeviceIDTargetDeviceSpecPair param) { - return printer << param.first << " : " << param.second; -} - -} // namespace mlir - LogicalResult TargetDeviceSpecAttr::verify(function_ref emitError, ArrayRef entries) { - // Entries in a target device spec can only have StringAttr as key. It does - // not support type as a key. Hence not reusing - // DataLayoutEntryInterface::verify. - DenseSet ids; - for (DataLayoutEntryInterface entry : entries) { - if (auto type = llvm::dyn_cast_if_present(entry.getKey())) { - return emitError() - << "dlti.target_device_spec does not allow type as a key: " - << type; - } else { - // Check that keys in a target device spec are unique. 
- auto id = entry.getKey().get(); - if (!ids.insert(id).second) - return emitError() << "repeated layout entry key: " << id.getValue(); - } - } + return verifyEntries(emitError, entries, /*allowTypes=*/false); +} - return success(); +/// Parses an attribute with syntax: +/// dev-spec-attr ::= `#dlti.` `target_device_spec` `<` entry-list `>` +/// entry-list ::= entry | entry `,` entry-list +/// entry ::= (quoted-string `=` attr) | dl-entry-attr +Attribute TargetDeviceSpecAttr::parse(AsmParser &parser, Type type) { + return parseAngleBracketedEntries(parser, type); +} + +void TargetDeviceSpecAttr::print(AsmPrinter &printer) const { + printAngleBracketedEntries(printer, getEntries()); } //===----------------------------------------------------------------------===// @@ -382,27 +445,46 @@ TargetDeviceSpecAttr::verify(function_ref emitError, LogicalResult TargetSystemSpecAttr::verify(function_ref emitError, - ArrayRef entries) { - DenseSet device_ids; + ArrayRef entries) { + DenseSet deviceIds; for (const auto &entry : entries) { - TargetDeviceSpecInterface target_device_spec = entry.second; - - // First verify that a target device spec is valid. - if (failed(TargetDeviceSpecAttr::verify(emitError, - target_device_spec.getEntries()))) - return failure(); + auto deviceId = + llvm::dyn_cast(entry.getKey()); + if (!deviceId) + return emitError() << "non-string key of DLTI system spec"; + + if (auto targetDeviceSpec = + llvm::dyn_cast(entry.getValue())) { + if (failed(TargetDeviceSpecAttr::verify(emitError, + targetDeviceSpec.getEntries()))) + return failure(); // Assume sub-verifier outputted error message. + } else { + return emitError() << "value associated with key " << deviceId + << " is not a DLTI device spec"; + } // Check that device IDs are unique across all entries. 
- TargetSystemSpecInterface::DeviceID device_id = entry.first; - if (!device_ids.insert(device_id).second) { - return emitError() << "repeated Device ID in dlti.target_system_spec: " - << device_id; - } + if (!deviceIds.insert(deviceId).second) + return emitError() << "repeated device ID in dlti.target_system_spec: " + << deviceId; } + return success(); } +/// Parses an attribute with syntax: +/// sys-spec-attr ::= `#dlti.` `target_system_spec` `<` entry-list `>` +/// entry-list ::= entry | entry `,` entry-list +/// entry ::= (quoted-string `=` dev-spec-attr) | dl-entry-attr +Attribute TargetSystemSpecAttr::parse(AsmParser &parser, Type type) { + return parseAngleBracketedEntries(parser, type); +} + +void TargetSystemSpecAttr::print(AsmPrinter &printer) const { + printAngleBracketedEntries(printer, getEntries()); +} + //===----------------------------------------------------------------------===// // DLTIDialect //===----------------------------------------------------------------------===// @@ -417,7 +499,7 @@ getClosestQueryable(Operation *op) { // Search op and its ancestors for the first attached DLTIQueryInterface attr. do { for (NamedAttribute attr : op->getAttrs()) - if ((queryable = llvm::dyn_cast(attr.getValue()))) + if ((queryable = dyn_cast(attr.getValue()))) break; } while (!queryable && (op = op->getParentOp())); @@ -446,18 +528,9 @@ dlti::query(Operation *op, ArrayRef keys, bool emitError) { return failure(); } - auto keyToStr = [](DataLayoutEntryKey key) -> std::string { - std::string buf; - llvm::TypeSwitch(key) - .Case( // The only two kinds of key we know of. 
- [&](auto key) { llvm::raw_string_ostream(buf) << key; }) - .Default([](auto) { llvm_unreachable("unexpected entry key kind"); }); - return buf; - }; - Attribute currentAttr = queryable; for (auto &&[idx, key] : llvm::enumerate(keys)) { - if (auto map = llvm::dyn_cast(currentAttr)) { + if (auto map = dyn_cast(currentAttr)) { auto maybeAttr = map.query(key); if (failed(maybeAttr)) { if (emitError) { @@ -503,7 +576,7 @@ class TargetDataLayoutInterface : public DataLayoutDialectInterface { Location loc) const final { StringRef entryName = entry.getKey().get().strref(); if (entryName == DLTIDialect::kDataLayoutEndiannessKey) { - auto value = llvm::dyn_cast(entry.getValue()); + auto value = dyn_cast(entry.getValue()); if (value && (value.getValue() == DLTIDialect::kDataLayoutEndiannessBig || value.getValue() == DLTIDialect::kDataLayoutEndiannessLittle)) diff --git a/mlir/lib/Dialect/Func/Transforms/DecomposeCallGraphTypes.cpp b/mlir/lib/Dialect/Func/Transforms/DecomposeCallGraphTypes.cpp index 357f993710a26..de4aba2ed327d 100644 --- a/mlir/lib/Dialect/Func/Transforms/DecomposeCallGraphTypes.cpp +++ b/mlir/lib/Dialect/Func/Transforms/DecomposeCallGraphTypes.cpp @@ -14,52 +14,48 @@ using namespace mlir; using namespace mlir::func; //===----------------------------------------------------------------------===// -// ValueDecomposer +// Helper functions //===----------------------------------------------------------------------===// -void ValueDecomposer::decomposeValue(OpBuilder &builder, Location loc, - Type type, Value value, - SmallVectorImpl &results) { - for (auto &conversion : decomposeValueConversions) - if (conversion(builder, loc, type, value, results)) - return; - results.push_back(value); +/// If the given value can be decomposed with the type converter, decompose it. +/// Otherwise, return the given value. +// TODO: Value decomposition should happen automatically through a 1:N adaptor. +// This function will disappear when the 1:1 and 1:N drivers are merged. 
+static SmallVector decomposeValue(OpBuilder &builder, Location loc, + Value value, + const TypeConverter *converter) { + // Try to convert the given value's type. If that fails, just return the + // given value. + SmallVector convertedTypes; + if (failed(converter->convertType(value.getType(), convertedTypes))) + return {value}; + if (convertedTypes.empty()) + return {}; + + // If the given value's type is already legal, just return the given value. + TypeRange convertedTypeRange(convertedTypes); + if (convertedTypeRange == TypeRange(value.getType())) + return {value}; + + // Try to materialize a target conversion. If the materialization did not + // produce values of the requested type, the materialization failed. Just + // return the given value in that case. + SmallVector result = converter->materializeTargetConversion( + builder, loc, convertedTypeRange, value); + if (result.empty()) + return {value}; + return result; } -//===----------------------------------------------------------------------===// -// DecomposeCallGraphTypesOpConversionPattern -//===----------------------------------------------------------------------===// - -namespace { -/// Base OpConversionPattern class to make a ValueDecomposer available to -/// inherited patterns. -template -class DecomposeCallGraphTypesOpConversionPattern - : public OpConversionPattern { -public: - DecomposeCallGraphTypesOpConversionPattern(const TypeConverter &typeConverter, - MLIRContext *context, - ValueDecomposer &decomposer, - PatternBenefit benefit = 1) - : OpConversionPattern(typeConverter, context, benefit), - decomposer(decomposer) {} - -protected: - ValueDecomposer &decomposer; -}; -} // namespace - //===----------------------------------------------------------------------===// // DecomposeCallGraphTypesForFuncArgs //===----------------------------------------------------------------------===// namespace { -/// Expand function arguments according to the provided TypeConverter and -/// ValueDecomposer. 
+/// Expand function arguments according to the provided TypeConverter. struct DecomposeCallGraphTypesForFuncArgs - : public DecomposeCallGraphTypesOpConversionPattern { - using DecomposeCallGraphTypesOpConversionPattern:: - DecomposeCallGraphTypesOpConversionPattern; + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(func::FuncOp op, OpAdaptor adaptor, @@ -100,19 +96,22 @@ struct DecomposeCallGraphTypesForFuncArgs //===----------------------------------------------------------------------===// namespace { -/// Expand return operands according to the provided TypeConverter and -/// ValueDecomposer. +/// Expand return operands according to the provided TypeConverter. struct DecomposeCallGraphTypesForReturnOp - : public DecomposeCallGraphTypesOpConversionPattern { - using DecomposeCallGraphTypesOpConversionPattern:: - DecomposeCallGraphTypesOpConversionPattern; + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReturnOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { SmallVector newOperands; - for (Value operand : adaptor.getOperands()) - decomposer.decomposeValue(rewriter, op.getLoc(), operand.getType(), - operand, newOperands); + for (Value operand : adaptor.getOperands()) { + // TODO: We can directly take the values from the adaptor once this is a + // 1:N conversion pattern. + llvm::append_range(newOperands, + decomposeValue(rewriter, operand.getLoc(), operand, + getTypeConverter())); + } rewriter.replaceOpWithNewOp(op, newOperands); return success(); } @@ -124,12 +123,9 @@ struct DecomposeCallGraphTypesForReturnOp //===----------------------------------------------------------------------===// namespace { -/// Expand call op operands and results according to the provided TypeConverter -/// and ValueDecomposer. 
-struct DecomposeCallGraphTypesForCallOp - : public DecomposeCallGraphTypesOpConversionPattern { - using DecomposeCallGraphTypesOpConversionPattern:: - DecomposeCallGraphTypesOpConversionPattern; +/// Expand call op operands and results according to the provided TypeConverter. +struct DecomposeCallGraphTypesForCallOp : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(CallOp op, OpAdaptor adaptor, @@ -137,9 +133,13 @@ struct DecomposeCallGraphTypesForCallOp // Create the operands list of the new `CallOp`. SmallVector newOperands; - for (Value operand : adaptor.getOperands()) - decomposer.decomposeValue(rewriter, op.getLoc(), operand.getType(), - operand, newOperands); + for (Value operand : adaptor.getOperands()) { + // TODO: We can directly take the values from the adaptor once this is a + // 1:N conversion pattern. + llvm::append_range(newOperands, + decomposeValue(rewriter, operand.getLoc(), operand, + getTypeConverter())); + } // Create the new result types for the new `CallOp` and track the indices in // the new call op's results that correspond to the old call op's results. 
@@ -189,9 +189,8 @@ struct DecomposeCallGraphTypesForCallOp void mlir::populateDecomposeCallGraphTypesPatterns( MLIRContext *context, const TypeConverter &typeConverter, - ValueDecomposer &decomposer, RewritePatternSet &patterns) { + RewritePatternSet &patterns) { patterns .add(typeConverter, context, - decomposer); + DecomposeCallGraphTypesForReturnOp>(typeConverter, context); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 9640bbdf28df4..ee4e344674a67 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -56,13 +56,13 @@ void LLVMDialect::registerAttributes() { //===----------------------------------------------------------------------===// bool DINodeAttr::classof(Attribute attr) { - return llvm::isa( + return llvm::isa< + DIBasicTypeAttr, DICommonBlockAttr, DICompileUnitAttr, + DICompositeTypeAttr, DIDerivedTypeAttr, DIFileAttr, DIGenericSubrangeAttr, + DIGlobalVariableAttr, DIImportedEntityAttr, DILabelAttr, + DILexicalBlockAttr, DILexicalBlockFileAttr, DILocalVariableAttr, + DIModuleAttr, DINamespaceAttr, DINullTypeAttr, DIAnnotationAttr, + DIStringTypeAttr, DISubprogramAttr, DISubrangeAttr, DISubroutineTypeAttr>( attr); } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index e1df647d6a3c7..4a27a5ed8eb74 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2499,13 +2499,7 @@ void OrderedRegionOp::build(OpBuilder &builder, OperationState &state, OrderedRegionOp::build(builder, state, clauses.parLevelSimd); } -LogicalResult OrderedRegionOp::verify() { - // TODO: The code generation for ordered simd directive is not supported yet. 
- if (getParLevelSimd()) - return failure(); - - return verifyOrderedParent(**this); -} +LogicalResult OrderedRegionOp::verify() { return verifyOrderedParent(**this); } //===----------------------------------------------------------------------===// // TaskwaitOp diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index e390a613b5807..93e8cac6b84e9 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -543,7 +543,6 @@ bool TosaValidation::isValidElementType(Type type) { case 16: case 32: case 48: - case 64: return true; default: return false; diff --git a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp index 2158953c07110..9469780129d64 100644 --- a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp +++ b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp @@ -790,13 +790,22 @@ mlir::detail::verifyTargetSystemSpec(TargetSystemSpecInterface spec, DenseMap deviceDescKeys; DenseSet deviceIDs; for (const auto &entry : spec.getEntries()) { - TargetDeviceSpecInterface targetDeviceSpec = entry.second; + auto targetDeviceSpec = + dyn_cast(entry.getValue()); + + if (!targetDeviceSpec) + return failure(); + // First, verify individual target device desc specs. if (failed(targetDeviceSpec.verifyEntry(loc))) return failure(); // Check that device IDs are unique across all entries. 
- TargetSystemSpecInterface::DeviceID deviceID = entry.first; + auto deviceID = + llvm::dyn_cast(entry.getKey()); + if (!deviceID) + return failure(); + if (!deviceIDs.insert(deviceID).second) { return failure(); } diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 412125b6ea65f..779c1d12720c1 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -309,6 +309,35 @@ DICommonBlockAttr DebugImporter::translateImpl(llvm::DICommonBlock *node) { translate(node->getFile()), node->getLineNo()); } +DIGenericSubrangeAttr +DebugImporter::translateImpl(llvm::DIGenericSubrange *node) { + auto getAttrOrNull = + [&](llvm::DIGenericSubrange::BoundType data) -> Attribute { + if (data.isNull()) + return nullptr; + if (auto *expr = dyn_cast(data)) + return translateExpression(expr); + if (auto *var = dyn_cast(data)) { + if (auto *local = dyn_cast(var)) + return translate(local); + if (auto *global = dyn_cast(var)) + return translate(global); + return nullptr; + } + return nullptr; + }; + Attribute count = getAttrOrNull(node->getCount()); + Attribute upperBound = getAttrOrNull(node->getUpperBound()); + Attribute lowerBound = getAttrOrNull(node->getLowerBound()); + Attribute stride = getAttrOrNull(node->getStride()); + // Either count or the upper bound needs to be present. Otherwise, the + // metadata is invalid. 
+ if (!count && !upperBound) + return {}; + return DIGenericSubrangeAttr::get(context, count, lowerBound, upperBound, + stride); +} + DISubroutineTypeAttr DebugImporter::translateImpl(llvm::DISubroutineType *node) { SmallVector types; @@ -378,6 +407,8 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) { return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); + if (auto *casted = dyn_cast(node)) + return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); return nullptr; diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index a452e01a9f604..b224ad3376be7 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -79,6 +79,7 @@ class DebugImporter { DIScopeAttr translateImpl(llvm::DIScope *node); DISubprogramAttr translateImpl(llvm::DISubprogram *node); DISubrangeAttr translateImpl(llvm::DISubrange *node); + DIGenericSubrangeAttr translateImpl(llvm::DIGenericSubrange *node); DICommonBlockAttr translateImpl(llvm::DICommonBlock *node); DISubroutineTypeAttr translateImpl(llvm::DISubroutineType *node); DITypeAttr translateImpl(llvm::DIType *node); diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index 2491db299af31..cf734de49acd6 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -404,6 +404,33 @@ llvm::DICommonBlock *DebugTranslation::translateImpl(DICommonBlockAttr attr) { translate(attr.getFile()), attr.getLine()); } +llvm::DIGenericSubrange * +DebugTranslation::translateImpl(DIGenericSubrangeAttr attr) { + auto getMetadataOrNull = [&](Attribute attr) -> llvm::Metadata * { + if (!attr) + return nullptr; + + llvm::Metadata *metadata = + llvm::TypeSwitch(attr) + .Case([&](LLVM::DIExpressionAttr expr) { + return translateExpression(expr); + }) + .Case([&](LLVM::DILocalVariableAttr local) { + 
return translate(local); + }) + .Case<>([&](LLVM::DIGlobalVariableAttr global) { + return translate(global); + }) + .Default([&](Attribute attr) { return nullptr; }); + return metadata; + }; + return llvm::DIGenericSubrange::get(llvmCtx, + getMetadataOrNull(attr.getCount()), + getMetadataOrNull(attr.getLowerBound()), + getMetadataOrNull(attr.getUpperBound()), + getMetadataOrNull(attr.getStride())); +} + llvm::DISubroutineType * DebugTranslation::translateImpl(DISubroutineTypeAttr attr) { // Concatenate the result and argument types into a single array. @@ -437,11 +464,11 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) { node = TypeSwitch(attr) .Case( + DIGenericSubrangeAttr, DIGlobalVariableAttr, + DIImportedEntityAttr, DILabelAttr, DILexicalBlockAttr, + DILexicalBlockFileAttr, DILocalVariableAttr, DIModuleAttr, + DINamespaceAttr, DINullTypeAttr, DIStringTypeAttr, + DISubprogramAttr, DISubrangeAttr, DISubroutineTypeAttr>( [&](auto attr) { return translateImpl(attr); }); if (node && !node->isTemporary()) diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h index ff4eaa46c564e..930e6a2672136 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.h +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h @@ -87,6 +87,7 @@ class DebugTranslation { llvm::DINamespace *translateImpl(DINamespaceAttr attr); llvm::DIScope *translateImpl(DIScopeAttr attr); llvm::DISubprogram *translateImpl(DISubprogramAttr attr); + llvm::DIGenericSubrange *translateImpl(DIGenericSubrangeAttr attr); llvm::DISubrange *translateImpl(DISubrangeAttr attr); llvm::DICommonBlock *translateImpl(DICommonBlockAttr attr); llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d20e5e40076bc..dca29f55661b0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ 
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -89,8 +89,257 @@ class OpenMPVarMappingStackFrame DenseMap mapping; }; + +/// Custom error class to signal translation errors that don't need reporting, +/// since encountering them will have already triggered relevant error messages. +/// +/// Its purpose is to serve as the glue between MLIR failures represented as +/// \see LogicalResult instances and \see llvm::Error instances used to +/// propagate errors through the \see llvm::OpenMPIRBuilder. Generally, when an +/// error of the first type is raised, a message is emitted directly (the \see +/// LogicalResult itself does not hold any information). If we need to forward +/// this error condition as an \see llvm::Error while avoiding triggering some +/// redundant error reporting later on, we need a custom \see llvm::ErrorInfo +/// class to just signal this situation has happened. +/// +/// For example, this class should be used to trigger errors from within +/// callbacks passed to the \see OpenMPIRBuilder when they were triggered by the +/// translation of their own regions. This unclutters the error log from +/// redundant messages. +class PreviouslyReportedError + : public llvm::ErrorInfo { +public: + void log(raw_ostream &) const override { + // Do not log anything. + } + + std::error_code convertToErrorCode() const override { + llvm_unreachable( + "PreviouslyReportedError doesn't support ECError conversion"); + } + + // Used by ErrorInfo::classID. 
+ static char ID; +}; + +char PreviouslyReportedError::ID = 0; + } // namespace +/// Looks up from the operation from and returns the PrivateClauseOp with +/// name symbolName +static omp::PrivateClauseOp findPrivatizer(Operation *from, + SymbolRefAttr symbolName) { + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom(from, + symbolName); + assert(privatizer && "privatizer not found in the symbol table"); + return privatizer; +} + +/// Check whether translation to LLVM IR for the given operation is currently +/// supported. If not, descriptive diagnostics will be emitted to let users know +/// this is a not-yet-implemented feature. +/// +/// \returns success if no unimplemented features are needed to translate the +/// given operation. +static LogicalResult checkImplementationStatus(Operation &op) { + auto todo = [&op](StringRef clauseName) { + return op.emitError(clauseName + " clause not yet supported"); + }; + + auto checkAligned = [&todo](auto op, LogicalResult &result) { + if (!op.getAlignedVars().empty() || op.getAlignments()) + result = todo("aligned"); + }; + auto checkAllocate = [&todo](auto op, LogicalResult &result) { + if (!op.getAllocateVars().empty() || !op.getAllocatorVars().empty()) + result = todo("allocate"); + }; + auto checkDepend = [&todo](auto op, LogicalResult &result) { + if (!op.getDependVars().empty() || op.getDependKinds()) + result = todo("depend"); + }; + auto checkDevice = [&todo](auto op, LogicalResult &result) { + if (op.getDevice()) + result = todo("device"); + }; + auto checkHasDeviceAddr = [&todo](auto op, LogicalResult &result) { + if (!op.getHasDeviceAddrVars().empty()) + result = todo("has_device_addr"); + }; + auto checkHint = [](auto op, LogicalResult &) { + if (op.getHint()) + op.emitWarning("hint clause discarded"); + }; + auto checkIf = [&todo](auto op, LogicalResult &result) { + if (op.getIfExpr()) + result = todo("if"); + }; + auto checkInReduction = [&todo](auto op, LogicalResult &result) { + if 
(!op.getInReductionVars().empty() || op.getInReductionByref() || + op.getInReductionSyms()) + result = todo("in_reduction"); + }; + auto checkIsDevicePtr = [&todo](auto op, LogicalResult &result) { + if (!op.getIsDevicePtrVars().empty()) + result = todo("is_device_ptr"); + }; + auto checkLinear = [&todo](auto op, LogicalResult &result) { + if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty()) + result = todo("linear"); + }; + auto checkMergeable = [&todo](auto op, LogicalResult &result) { + if (op.getMergeable()) + result = todo("mergeable"); + }; + auto checkNontemporal = [&todo](auto op, LogicalResult &result) { + if (!op.getNontemporalVars().empty()) + result = todo("nontemporal"); + }; + auto checkNowait = [&todo](auto op, LogicalResult &result) { + if (op.getNowait()) + result = todo("nowait"); + }; + auto checkOrder = [&todo](auto op, LogicalResult &result) { + if (op.getOrder() || op.getOrderMod()) + result = todo("order"); + }; + auto checkParLevelSimd = [&todo](auto op, LogicalResult &result) { + if (op.getParLevelSimd()) + result = todo("parallelization-level"); + }; + auto checkPriority = [&todo](auto op, LogicalResult &result) { + if (op.getPriority()) + result = todo("priority"); + }; + auto checkPrivate = [&todo](auto op, LogicalResult &result) { + if (!op.getPrivateVars().empty() || op.getPrivateSyms()) + result = todo("privatization"); + }; + auto checkReduction = [&todo](auto op, LogicalResult &result) { + if (!op.getReductionVars().empty() || op.getReductionByref() || + op.getReductionSyms()) + result = todo("reduction"); + }; + auto checkThreadLimit = [&todo](auto op, LogicalResult &result) { + if (op.getThreadLimit()) + result = todo("thread_limit"); + }; + auto checkTaskReduction = [&todo](auto op, LogicalResult &result) { + if (!op.getTaskReductionVars().empty() || op.getTaskReductionByref() || + op.getTaskReductionSyms()) + result = todo("task_reduction"); + }; + auto checkUntied = [&todo](auto op, LogicalResult &result) { + if 
(op.getUntied()) + result = todo("untied"); + }; + + LogicalResult result = success(); + llvm::TypeSwitch(op) + .Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); }) + .Case([&](omp::SectionsOp op) { + checkAllocate(op, result); + checkPrivate(op, result); + }) + .Case([&](omp::SingleOp op) { + checkAllocate(op, result); + checkPrivate(op, result); + }) + .Case([&](omp::TeamsOp op) { + checkAllocate(op, result); + checkPrivate(op, result); + checkReduction(op, result); + }) + .Case([&](omp::TaskOp op) { + checkAllocate(op, result); + checkInReduction(op, result); + checkMergeable(op, result); + checkPriority(op, result); + checkPrivate(op, result); + checkUntied(op, result); + }) + .Case([&](omp::TaskgroupOp op) { + checkAllocate(op, result); + checkTaskReduction(op, result); + }) + .Case([&](omp::TaskwaitOp op) { + checkDepend(op, result); + checkNowait(op, result); + }) + .Case([&](omp::WsloopOp op) { + checkAllocate(op, result); + checkLinear(op, result); + checkOrder(op, result); + checkPrivate(op, result); + }) + .Case([&](omp::ParallelOp op) { checkAllocate(op, result); }) + .Case([&](omp::SimdOp op) { + checkAligned(op, result); + checkLinear(op, result); + checkNontemporal(op, result); + checkPrivate(op, result); + checkReduction(op, result); + }) + .Case([&](auto op) { checkHint(op, result); }) + .Case( + [&](auto op) { checkDepend(op, result); }) + .Case([&](omp::TargetOp op) { + checkAllocate(op, result); + checkDevice(op, result); + checkHasDeviceAddr(op, result); + checkIf(op, result); + checkInReduction(op, result); + checkIsDevicePtr(op, result); + // Privatization clauses are supported, except on some situations, so we + // need to check here whether any of these unsupported cases are being + // translated. 
+ if (std::optional privateSyms = op.getPrivateSyms()) { + for (Attribute privatizerNameAttr : *privateSyms) { + omp::PrivateClauseOp privatizer = findPrivatizer( + op.getOperation(), cast(privatizerNameAttr)); + + if (privatizer.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate) + result = todo("firstprivate"); + + if (!privatizer.getDeallocRegion().empty()) + result = + op.emitError("privatization of structures not yet supported"); + } + } + checkThreadLimit(op, result); + }) + .Default([](Operation &) { + // Assume all clauses for an operation can be translated unless they are + // checked above. + }); + return result; +} + +static LogicalResult handleError(llvm::Error error, Operation &op) { + LogicalResult result = success(); + if (error) { + llvm::handleAllErrors( + std::move(error), + [&](const PreviouslyReportedError &) { result = failure(); }, + [&](const llvm::ErrorInfoBase &err) { + result = op.emitError(err.message()); + }); + } + return result; +} + +template +static LogicalResult handleError(llvm::Expected &result, Operation &op) { + if (!result) + return handleError(result.takeError(), op); + + return success(); +} + /// Find the insertion point for allocas given the current insertion point for /// normal operations in the builder. 
static llvm::OpenMPIRBuilder::InsertPointTy @@ -216,7 +465,7 @@ static llvm::Expected convertOmpOpRegions( llvm::IRBuilderBase::InsertPointGuard guard(builder); if (failed( moduleTranslation.convertBlock(*bb, bb->isEntryBlock(), builder))) - return llvm::createStringError("failed region translation"); + return llvm::make_error(); // Special handling for `omp.yield` and `omp.terminator` (we may have more // than one): they return the control to the parent OpenMP dialect operation @@ -287,6 +536,7 @@ convertIgnoredWrapper(omp::LoopWrapperInterface &opInst, forwardArgs(blockArgIface.getPrivateBlockArgs(), op.getPrivateVars()); forwardArgs(blockArgIface.getReductionBlockArgs(), op.getReductionVars()); + op.emitWarning() << "simd information on composite construct discarded"; return success(); }) .Default([&](Operation *op) { @@ -325,6 +575,9 @@ convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder, auto maskedOp = cast(opInst); using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + if (failed(checkImplementationStatus(opInst))) + return failure(); + auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // MaskedOp has only one region associated with it. 
auto ®ion = maskedOp.getRegion(); @@ -352,8 +605,8 @@ convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createMasked(ompLoc, bodyGenCB, finiCB, filterVal); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -364,9 +617,14 @@ static LogicalResult convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + auto masterOp = cast(opInst); + + if (failed(checkImplementationStatus(opInst))) + return failure(); + auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // MasterOp has only one region associated with it. - auto ®ion = cast(opInst).getRegion(); + auto ®ion = masterOp.getRegion(); builder.restoreIP(codeGenIP); return convertOmpOpRegions(region, "omp.master.region", builder, moduleTranslation) @@ -382,8 +640,8 @@ convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createMaster(ompLoc, bodyGenCB, finiCB); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -396,6 +654,9 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto criticalOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { // CriticalOp has only one region associated with it. 
auto ®ion = cast(opInst).getRegion(); @@ -429,24 +690,13 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createCritical( ompLoc, bodyGenCB, finiCB, criticalOp.getName().value_or(""), hint); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); } -/// Looks up from the operation from and returns the PrivateClauseOp with -/// name symbolName -static omp::PrivateClauseOp findPrivatizer(Operation *from, - SymbolRefAttr symbolName) { - omp::PrivateClauseOp privatizer = - SymbolTable::lookupNearestSymbolFrom(from, - symbolName); - assert(privatizer && "privatizer not found in the symbol table"); - return privatizer; -} - /// Populates `privatizations` with privatization declarations used for the /// given op. /// TODO: generalise beyond ParallelOp @@ -537,9 +787,8 @@ static LogicalResult inlineConvertOmpRegions( llvm::Expected continuationBlock = convertOmpOpRegions(region, blockName, builder, moduleTranslation, &phis); - if (!continuationBlock) - return region.getParentOp()->emitError( - llvm::toString(continuationBlock.takeError())); + if (failed(handleError(continuationBlock, *region.getParentOp()))) + return failure(); if (continuationBlockArgs) llvm::append_range(*continuationBlockArgs, phis); @@ -582,7 +831,8 @@ makeReductionGen(omp::DeclareReductionOp decl, llvm::IRBuilderBase &builder, if (failed(inlineConvertOmpRegions(decl.getReductionRegion(), "omp.reduction.nonatomic.body", builder, moduleTranslation, &phis))) - return llvm::createStringError("failed reduction region translation"); + return llvm::createStringError( + "failed to inline `combiner` region of `omp.declare_reduction`"); assert(phis.size() == 1); result = phis[0]; return builder.saveIP(); @@ -615,7 +865,8 @@ makeAtomicReductionGen(omp::DeclareReductionOp decl, if 
(failed(inlineConvertOmpRegions(decl.getAtomicReductionRegion(), "omp.reduction.atomic.body", builder, moduleTranslation, &phis))) - return llvm::createStringError("failed reduction region translation"); + return llvm::createStringError( + "failed to inline `atomic` region of `omp.declare_reduction`"); assert(phis.empty()); return builder.saveIP(); }; @@ -628,6 +879,9 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { auto orderedOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + omp::ClauseDepend dependType = *orderedOp.getDoacrossDependType(); bool isDependSource = dependType == omp::ClauseDepend::dependsource; unsigned numLoops = *orderedOp.getDoacrossNumLoops(); @@ -659,8 +913,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto orderedRegionOp = cast(opInst); - // TODO: The code generation for ordered simd directive is not supported yet. 
- if (orderedRegionOp.getParLevelSimd()) + if (failed(checkImplementationStatus(opInst))) return failure(); auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { @@ -681,8 +934,8 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createOrderedThreadsSimd( ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getParLevelSimd()); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -728,9 +981,10 @@ allocReductionVars(T loop, ArrayRef reductionArgs, SmallVector phis; if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc", builder, moduleTranslation, &phis))) - return failure(); - assert(phis.size() == 1 && "expected one allocation to be yielded"); + return loop.emitError( + "failed to inline `alloc` region of `omp.declare_reduction`"); + assert(phis.size() == 1 && "expected one allocation to be yielded"); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); // Allocate reduction variable (which is a pointer to the real reduction @@ -887,8 +1141,8 @@ static LogicalResult createReductionsAndCleanup( ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos, isByRef, op.getNowait()); - if (!contInsertPoint) - return op.emitError(llvm::toString(contInsertPoint.takeError())); + if (failed(handleError(contInsertPoint, *op))) + return failure(); if (!contInsertPoint->getBlock()) return op->emitOpError() << "failed to convert reductions"; @@ -896,8 +1150,8 @@ static LogicalResult createReductionsAndCleanup( llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createBarrier(*contInsertPoint, llvm::omp::OMPD_for); - if (!afterIP) - return op.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *op))) + return failure(); tempTerminator->eraseFromParent(); builder.restoreIP(*afterIP); @@ 
-1005,12 +1259,8 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, auto sectionsOp = cast(opInst); - // TODO: Support the following clauses: private, firstprivate, lastprivate, - // allocate - if (!sectionsOp.getAllocateVars().empty() || - !sectionsOp.getAllocatorVars().empty() || - !sectionsOp.getPrivateVars().empty() || sectionsOp.getPrivateSyms()) - return opInst.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(opInst))) + return failure(); llvm::ArrayRef isByRef = getIsByRef(sectionsOp.getReductionByref()); assert(isByRef.size() == sectionsOp.getNumReductionVars()); @@ -1100,8 +1350,8 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, ompLoc, allocaIP, sectionCBs, privCB, finiCB, false, sectionsOp.getNowait()); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); builder.restoreIP(*afterIP); @@ -1118,8 +1368,8 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); - if (!singleOp.getPrivateVars().empty() || singleOp.getPrivateSyms()) - return singleOp.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(*singleOp))) + return failure(); auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { builder.restoreIP(codegenIP); @@ -1147,8 +1397,8 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, ompLoc, bodyCB, finiCB, singleOp.getNowait(), llvmCPVars, llvmCPFuncs); - if (!afterIP) - return singleOp.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *singleOp))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -1159,9 +1409,8 @@ static LogicalResult convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, 
LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - if (!op.getAllocatorVars().empty() || op.getReductionSyms() || - !op.getPrivateVars().empty() || op.getPrivateSyms()) - return op.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(*op))) + return failure(); auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { LLVM::ModuleTranslation::SaveStack frame( @@ -1193,8 +1442,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createTeams( ompLoc, bodyCB, numTeamsLower, numTeamsUpper, threadLimit, ifExpr); - if (!afterIP) - return op.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *op))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -1226,17 +1475,15 @@ buildDependData(std::optional dependKinds, OperandRange dependVars, dds.emplace_back(dd); } } + /// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - if (taskOp.getUntiedAttr() || taskOp.getMergeableAttr() || - taskOp.getInReductionSyms() || taskOp.getPriority() || - !taskOp.getAllocateVars().empty() || !taskOp.getPrivateVars().empty() || - taskOp.getPrivateSyms()) { - return taskOp.emitError("unhandled clauses for translation to LLVM IR"); - } + if (failed(checkImplementationStatus(*taskOp))) + return failure(); + auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. 
@@ -1262,8 +1509,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, moduleTranslation.lookupValue(taskOp.getFinal()), moduleTranslation.lookupValue(taskOp.getIfExpr()), dds); - if (!afterIP) - return taskOp.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *taskOp))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -1274,8 +1521,8 @@ static LogicalResult convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - if (!tgOp.getTaskReductionVars().empty() || !tgOp.getAllocateVars().empty()) - return tgOp.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(*tgOp))) + return failure(); auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { builder.restoreIP(codegenIP); @@ -1290,8 +1537,8 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, moduleTranslation.getOpenMPBuilder()->createTaskgroup(ompLoc, allocaIP, bodyCB); - if (!afterIP) - return tgOp.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *tgOp))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -1300,9 +1547,8 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, static LogicalResult convertOmpTaskwaitOp(omp::TaskwaitOp twOp, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { - if (!twOp.getDependVars().empty() || twOp.getDependKinds() || - twOp.getNowait()) - return twOp.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(*twOp))) + return failure(); moduleTranslation.getOpenMPBuilder()->createTaskwait(builder.saveIP()); return success(); @@ -1313,10 +1559,8 @@ static LogicalResult convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation 
&moduleTranslation) { auto wsloopOp = cast(opInst); - if (!wsloopOp.getAllocateVars().empty() || - !wsloopOp.getAllocatorVars().empty() || - !wsloopOp.getPrivateVars().empty() || wsloopOp.getPrivateSyms()) - return opInst.emitError("unhandled clauses for translation to LLVM IR"); + if (failed(checkImplementationStatus(opInst))) + return failure(); auto loopOp = cast(wsloopOp.getWrappedLoop()); @@ -1418,15 +1662,15 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, computeIP = loopInfos.front()->getPreheaderIP(); } - llvm::Expected result = + llvm::Expected loopResult = ompBuilder->createCanonicalLoop( loc, bodyGen, lowerBound, upperBound, step, /*IsSigned=*/true, loopOp.getLoopInclusive(), computeIP); - if (!result) - return loopOp.emitError(llvm::toString(result.takeError())); + if (failed(handleError(loopResult, *loopOp))) + return failure(); - loopInfos.push_back(*result); + loopInfos.push_back(*loopResult); } // Collapse loops. Store the insertion point because LoopInfos may get @@ -1449,8 +1693,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, scheduleMod == omp::ScheduleModifier::monotonic, scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered); - if (!wsloopIP) - return opInst.emitError(llvm::toString(wsloopIP.takeError())); + if (failed(handleError(wsloopIP, opInst))) + return failure(); // Continue building IR after the loop. 
Note that the LoopInfo returned by // `collapseLoops` points inside the outermost loop and is intended for @@ -1473,6 +1717,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, assert(isByRef.size() == opInst.getNumReductionVars()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + if (failed(checkImplementationStatus(*opInst))) + return failure(); + // Collect delayed privatization declarations MutableArrayRef privateBlockArgs = cast(*opInst).getPrivateBlockArgs(); @@ -1538,8 +1785,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc", builder, moduleTranslation, &phis))) return llvm::createStringError( - "failed to inline `alloc` region of an `omp.private` op in the " - "parallel region"); + "failed to inline `alloc` region of `omp.private`"); assert(phis.size() == 1 && "expected one allocation to be yielded"); @@ -1566,7 +1812,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst, reductionArgs, builder, moduleTranslation, allocaIP, reductionDecls, privateReductionVariables, reductionVariableMap, deferredStores, isByRef))) - return llvm::createStringError("failed reduction vars allocation"); + return llvm::make_error(); // Apply copy region for firstprivate. 
bool needsFirstprivate = @@ -1607,8 +1853,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy", builder, moduleTranslation))) return llvm::createStringError( - "failed to inline `copy` region of an `omp.private` op in the " - "parallel region"); + "failed to inline `copy` region of `omp.private`"); // ignore unused value yielded from copy region @@ -1658,8 +1903,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, reductionDecls[i].getInitializerRegion(), "omp.reduction.neutral", builder, moduleTranslation, &phis))) return llvm::createStringError( - "failed to inline `init` region of an `omp.declare_reduction` op " - "in the parallel region"); + "failed to inline `init` region of `omp.declare_reduction`"); assert(phis.size() == 1 && "expected one value to be yielded from the " "reduction neutral element declaration region"); @@ -1732,7 +1976,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, return contInsertPoint.takeError(); if (!contInsertPoint->getBlock()) - return llvm::createStringError("failed to convert reductions"); + return llvm::make_error(); tempTerminator->eraseFromParent(); builder.restoreIP(*contInsertPoint); @@ -1765,8 +2009,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, reductionCleanupRegions, privateReductionVariables, moduleTranslation, builder, "omp.reduction.cleanup"))) return llvm::createStringError( - "failed to inline `cleanup` region of an `omp.declare_reduction` op " - "in the parallel region"); + "failed to inline `cleanup` region of `omp.declare_reduction`"); SmallVector privateCleanupRegions; llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), @@ -1777,8 +2020,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, if (failed(inlineOmpRegionCleanup( privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder, 
"omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false))) - return llvm::createStringError("failed to inline `dealloc` region of an " - "`omp.private` op in the parallel region"); + return llvm::createStringError( + "failed to inline `dealloc` region of `omp.private`"); builder.restoreIP(oldIP); return llvm::Error::success(); @@ -1803,8 +2046,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, ifCond, numThreads, pbKind, isCancellable); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + + if (failed(handleError(afterIP, *opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -1822,20 +2066,6 @@ convertOrderKind(std::optional o) { llvm_unreachable("Unknown ClauseOrderKind kind"); } -static LogicalResult simdOpSupported(omp::SimdOp op) { - if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty()) - return op.emitError("linear clause not yet supported"); - - if (!op.getPrivateVars().empty() || op.getPrivateSyms()) - return op.emitError("privatization clauses not yet supported"); - - if (!op.getReductionVars().empty() || op.getReductionByref() || - op.getReductionSyms()) - return op.emitError("reduction clause not yet supported"); - - return success(); -} - /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder. 
static LogicalResult convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, @@ -1843,7 +2073,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, auto simdOp = cast(opInst); auto loopOp = cast(simdOp.getWrappedLoop()); - if (failed(simdOpSupported(simdOp))) + if (failed(checkImplementationStatus(opInst))) return failure(); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -1896,15 +2126,15 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, computeIP = loopInfos.front()->getPreheaderIP(); } - llvm::Expected result = + llvm::Expected loopResult = ompBuilder->createCanonicalLoop( loc, bodyGen, lowerBound, upperBound, step, /*IsSigned=*/true, /*InclusiveStop=*/true, computeIP); - if (!result) - return loopOp->emitError(llvm::toString(result.takeError())); + if (failed(handleError(loopResult, *loopOp))) + return failure(); - loopInfos.push_back(*result); + loopInfos.push_back(*loopResult); } // Collapse loops. @@ -1957,8 +2187,10 @@ convertAtomicOrdering(std::optional ao) { static LogicalResult convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { - auto readOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -1981,6 +2213,9 @@ static LogicalResult convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { auto writeOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -2016,6 +2251,8 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::OpenMPIRBuilder *ompBuilder = 
moduleTranslation.getOpenMPBuilder(); + if (failed(checkImplementationStatus(*opInst))) + return failure(); // Convert values and types. auto &innerOpList = opInst.getRegion().front().getOperations(); @@ -2063,8 +2300,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, moduleTranslation.mapValue(*opInst.getRegion().args_begin(), atomicx); moduleTranslation.mapBlock(&bb, builder.GetInsertBlock()); if (failed(moduleTranslation.convertBlock(bb, true, builder))) - return llvm::createStringError( - "unable to convert update operation to llvm IR"); + return llvm::make_error(); omp::YieldOp yieldop = dyn_cast(bb.getTerminator()); assert(yieldop && yieldop.getResults().size() == 1 && @@ -2081,8 +2317,8 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, atomicOrdering, binop, updateFn, isXBinopExpr); - if (!afterIP) - return opInst.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *opInst))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -2093,6 +2329,9 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + if (failed(checkImplementationStatus(*atomicCaptureOp))) + return failure(); + mlir::Value mlirExpr; bool isXBinopExpr = false, isPostfixUpdate = false; llvm::AtomicRMWInst::BinOp binop = llvm::AtomicRMWInst::BinOp::BAD_BINOP; @@ -2156,8 +2395,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, atomicx); moduleTranslation.mapBlock(&bb, builder.GetInsertBlock()); if (failed(moduleTranslation.convertBlock(bb, true, builder))) - return llvm::createStringError( - "unable to convert update operation to llvm IR"); + return llvm::make_error(); omp::YieldOp yieldop = dyn_cast(bb.getTerminator()); assert(yieldop && yieldop.getResults().size() == 1 && @@ -2174,8 +2412,8 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, ompLoc, 
allocaIP, llvmAtomicX, llvmAtomicV, llvmExpr, atomicOrdering, binop, updateFn, atomicUpdateOp, isPostfixUpdate, isXBinopExpr); - if (!afterIP) - return atomicCaptureOp.emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *atomicCaptureOp))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -2189,6 +2427,9 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); auto threadprivateOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + Value symAddr = threadprivateOp.getSymAddr(); auto *symOp = symAddr.getDefiningOp(); if (!isa(symOp)) @@ -3036,6 +3277,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, LogicalResult result = llvm::TypeSwitch(op) .Case([&](omp::TargetDataOp dataOp) { + if (failed(checkImplementationStatus(*dataOp))) + return failure(); + if (auto ifVar = dataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifVar); @@ -3050,10 +3294,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, useDeviceAddrVars = dataOp.getUseDeviceAddrVars(); return success(); }) - .Case([&](omp::TargetEnterDataOp enterDataOp) { - if (!enterDataOp.getDependVars().empty()) - return (LogicalResult)(enterDataOp.emitError( - "`depend` is not supported yet")); + .Case([&](omp::TargetEnterDataOp enterDataOp) -> LogicalResult { + if (failed(checkImplementationStatus(*enterDataOp))) + return failure(); if (auto ifVar = enterDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifVar); @@ -3071,10 +3314,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, info.HasNoWait = enterDataOp.getNowait(); return success(); }) - .Case([&](omp::TargetExitDataOp exitDataOp) { - if (!exitDataOp.getDependVars().empty()) - return (LogicalResult)(exitDataOp.emitError( - "`depend` is not supported yet")); + .Case([&](omp::TargetExitDataOp exitDataOp) -> LogicalResult { + if 
(failed(checkImplementationStatus(*exitDataOp))) + return failure(); if (auto ifVar = exitDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifVar); @@ -3092,10 +3334,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, info.HasNoWait = exitDataOp.getNowait(); return success(); }) - .Case([&](omp::TargetUpdateOp updateDataOp) { - if (!updateDataOp.getDependVars().empty()) - return (LogicalResult)(updateDataOp.emitError( - "`depend` is not supported yet")); + .Case([&](omp::TargetUpdateOp updateDataOp) -> LogicalResult { + if (failed(checkImplementationStatus(*updateDataOp))) + return failure(); if (auto ifVar = updateDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifVar); @@ -3115,8 +3356,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, return success(); }) .Default([&](Operation *op) { - return op->emitError("unsupported OpenMP operation: ") - << op->getName(); + llvm_unreachable("unexpected operation"); + return failure(); }); if (failed(result)) @@ -3192,8 +3433,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (failed(inlineConvertOmpRegions(region, "omp.data.region", builder, moduleTranslation))) - return llvm::createStringError( - "failed to inline region of an `omp.target_data` op"); + return llvm::make_error(); } break; case BodyGenTy::DupNoPriv: @@ -3215,8 +3455,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, if (failed(inlineConvertOmpRegions(region, "omp.data.region", builder, moduleTranslation))) - return llvm::createStringError( - "failed to inline region of an `omp.target_data` op"); + return llvm::make_error(); } break; } @@ -3236,8 +3475,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, info, genMapInfoCB, &RTLFn); }(); - if (!afterIP) - return op->emitError(llvm::toString(afterIP.takeError())); + if (failed(handleError(afterIP, *op))) + return failure(); builder.restoreIP(*afterIP); return success(); @@ -3303,38 +3542,6 @@ 
static bool getTargetEntryUniqueInfo(llvm::TargetRegionEntryInfo &targetInfo, return true; } -static bool targetOpSupported(Operation &opInst) { - auto targetOp = cast(opInst); - if (targetOp.getIfExpr()) { - opInst.emitError("If clause not yet supported"); - return false; - } - - if (targetOp.getDevice()) { - opInst.emitError("Device clause not yet supported"); - return false; - } - - if (targetOp.getThreadLimit()) { - opInst.emitError("Thread limit clause not yet supported"); - return false; - } - - if (!targetOp.getAllocateVars().empty() || - !targetOp.getAllocatorVars().empty()) { - opInst.emitError("Allocate clause not yet supported"); - return false; - } - - if (!targetOp.getInReductionVars().empty() || - targetOp.getInReductionByref() || targetOp.getInReductionSyms()) { - opInst.emitError("In reduction clause not yet supported"); - return false; - } - - return true; -} - static void handleDeclareTargetMapVar(MapInfoData &mapData, LLVM::ModuleTranslation &moduleTranslation, @@ -3484,14 +3691,13 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, static LogicalResult convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { - - if (!targetOpSupported(opInst)) + auto targetOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) return failure(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool isTargetDevice = ompBuilder->Config.isTargetDevice(); auto parentFn = opInst.getParentOfType(); - auto targetOp = cast(opInst); auto &targetRegion = targetOp.getRegion(); DataLayout dl = DataLayout(opInst.getParentOfType()); SmallVector mapVars = targetOp.getMapVars(); @@ -3546,14 +3752,10 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, SymbolRefAttr privSym = cast(privatizerNameAttr); omp::PrivateClauseOp privatizer = findPrivatizer(&opInst, privSym); - if (privatizer.getDataSharingType() == - omp::DataSharingClauseType::FirstPrivate 
|| - !privatizer.getDeallocRegion().empty()) { - return llvm::createStringError( - "Translation of omp.target from MLIR to LLVMIR " - "failed because translation of firstprivate and " - " private allocatables is not supported yet"); - } + assert(privatizer.getDataSharingType() != + omp::DataSharingClauseType::FirstPrivate && + privatizer.getDeallocRegion().empty() && + "unsupported privatizer"); moduleTranslation.mapValue(privatizer.getAllocMoldArg(), moduleTranslation.lookupValue(privVar)); Region &allocRegion = privatizer.getAllocRegion(); @@ -3562,8 +3764,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, allocRegion, "omp.targetop.privatizer", builder, moduleTranslation, &yieldedValues))) { return llvm::createStringError( - "failed to inline `alloc` region of an `omp.private` " - "op in the target region"); + "failed to inline `alloc` region of `omp.private`"); } assert(yieldedValues.size() == 1); moduleTranslation.mapValue(privBlockArg, yieldedValues.front()); @@ -3641,16 +3842,16 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(), moduleTranslation, dds); - llvm::OpenMPIRBuilder::InsertPointOrErrorTy result = + llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo, defaultValTeams, defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB, dds, targetOp.getNowait()); - if (!result) - return opInst.emitError(llvm::toString(result.takeError())); + if (failed(handleError(afterIP, opInst))) + return failure(); - builder.restoreIP(*result); + builder.restoreIP(*afterIP); // Remap access operations to declare target reference pointers for the // device, essentially generating extra loadop's as necessary @@ -3779,20 +3980,26 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = 
moduleTranslation.getOpenMPBuilder(); return llvm::TypeSwitch(op) - .Case([&](omp::BarrierOp) -> LogicalResult { - llvm::OpenMPIRBuilder::InsertPointOrErrorTy result = + .Case([&](omp::BarrierOp op) -> LogicalResult { + if (failed(checkImplementationStatus(*op))) + return failure(); + + llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier); - if (!result) - return op->emitError(llvm::toString(result.takeError())); - - return success(); + return handleError(afterIP, *op); }) - .Case([&](omp::TaskyieldOp) { + .Case([&](omp::TaskyieldOp op) { + if (failed(checkImplementationStatus(*op))) + return failure(); + ompBuilder->createTaskyield(builder.saveIP()); return success(); }) - .Case([&](omp::FlushOp) { + .Case([&](omp::FlushOp op) { + if (failed(checkImplementationStatus(*op))) + return failure(); + // No support in Openmp runtime function (__kmpc_flush) to accept // the argument list. // OpenMP standard states the following: diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt index eb588640dbf83..72eb34f36cf5f 100644 --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -10,6 +10,7 @@ add_mlir_library(MLIRTransformUtils LoopInvariantCodeMotionUtils.cpp OneToNTypeConversion.cpp RegionUtils.cpp + WalkPatternRewriteDriver.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms diff --git a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp new file mode 100644 index 0000000000000..ee5c642c943c4 --- /dev/null +++ b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp @@ -0,0 +1,116 @@ +//===- WalkPatternRewriteDriver.cpp - A fast walk-based rewriter ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements mlir::walkAndApplyPatterns. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Transforms/WalkPatternRewriteDriver.h" + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Verifier.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Rewrite/PatternApplicator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" + +#define DEBUG_TYPE "walk-rewriter" + +namespace mlir { + +namespace { +struct WalkAndApplyPatternsAction final + : tracing::ActionImpl { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WalkAndApplyPatternsAction) + using ActionImpl::ActionImpl; + static constexpr StringLiteral tag = "walk-and-apply-patterns"; + void print(raw_ostream &os) const override { os << tag; } +}; + +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS +// Forwarding listener to guard against unsupported erasures of non-descendant +// ops/blocks. Because we use walk-based pattern application, erasing the +// op/block from the *next* iteration (e.g., a user of the visited op) is not +// valid. Note that this is only used with expensive pattern API checks. 
+struct ErasedOpsListener final : RewriterBase::ForwardingListener { + using RewriterBase::ForwardingListener::ForwardingListener; + + void notifyOperationErased(Operation *op) override { + checkErasure(op); + ForwardingListener::notifyOperationErased(op); + } + + void notifyBlockErased(Block *block) override { + checkErasure(block->getParentOp()); + ForwardingListener::notifyBlockErased(block); + } + + void checkErasure(Operation *op) const { + Operation *ancestorOp = op; + while (ancestorOp && ancestorOp != visitedOp) + ancestorOp = ancestorOp->getParentOp(); + + if (ancestorOp != visitedOp) + llvm::report_fatal_error( + "unsupported erasure in WalkPatternRewriter; " + "erasure is only supported for matched ops and their descendants"); + } + + Operation *visitedOp = nullptr; +}; +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS +} // namespace + +void walkAndApplyPatterns(Operation *op, + const FrozenRewritePatternSet &patterns, + RewriterBase::Listener *listener) { +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (failed(verify(op))) + llvm::report_fatal_error("walk pattern rewriter input IR failed to verify"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + + MLIRContext *ctx = op->getContext(); + PatternRewriter rewriter(ctx); +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + ErasedOpsListener erasedListener(listener); + rewriter.setListener(&erasedListener); +#else + rewriter.setListener(listener); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + + PatternApplicator applicator(patterns); + applicator.applyDefaultCostModel(); + + ctx->executeAction( + [&] { + for (Region ®ion : op->getRegions()) { + region.walk([&](Operation *visitedOp) { + LLVM_DEBUG(llvm::dbgs() << "Visiting op: "; visitedOp->print( + llvm::dbgs(), OpPrintingFlags().skipRegions()); + llvm::dbgs() << "\n";); +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + erasedListener.visitedOp = visitedOp; +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if 
(succeeded(applicator.matchAndRewrite(visitedOp, rewriter))) { + LLVM_DEBUG(llvm::dbgs() << "\tOp matched and rewritten\n";); + } + }); + } + }, + {op}); + +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (failed(verify(op))) + llvm::report_fatal_error( + "walk pattern rewriter result IR failed to verify"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS +} + +} // namespace mlir diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt index d1b5418cca5b2..eeaac27461b11 100644 --- a/mlir/python/requirements.txt +++ b/mlir/python/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.19.5, <=1.26 -pybind11>=2.9.0, <=2.10.3 +numpy>=1.19.5, <=2.1.2 +pybind11>=2.9.0, <=2.13.6 PyYAML>=5.4.0, <=6.0.1 -ml_dtypes>=0.1.0, <=0.4.0 # provides several NumPy dtype extensions, including the bf16 +ml_dtypes>=0.1.0, <=0.5.0 # provides several NumPy dtype extensions, including the bf16 diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 7fd5610a88913..a9ea44925e914 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -163,6 +163,17 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: mem func.return } +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16 +func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: memref<64xbf16>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) + // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xbf16> + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xbf16> -> memref<64xbf16>, i32 + func.return 
+} + // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir index d6db7922859a5..de53f964231cc 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir @@ -1,32 +1,38 @@ // RUN: mlir-opt -convert-memref-to-emitc %s -split-input-file | FileCheck %s +// CHECK-LABEL: alloca() +func.func @alloca() { + // CHECK-NEXT: %[[ALLOCA:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.array<2xf32> + %0 = memref.alloca() : memref<2xf32> + return +} + +// ----- + // CHECK-LABEL: memref_store -// CHECK-SAME: %[[v:.*]]: f32, %[[argi:.*]]: index, %[[argj:.*]]: index -func.func @memref_store(%v : f32, %i: index, %j: index) { - // CHECK: %[[j:.*]] = builtin.unrealized_conversion_cast %[[argj]] : index to !emitc.size_t - // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[argi]] : index to !emitc.size_t - // CHECK-NEXT: %[[ALLOCA:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.array<4x8xf32> - %0 = memref.alloca() : memref<4x8xf32> - - // CHECK-NEXT: %[[SUBSCRIPT:.*]] = emitc.subscript %[[ALLOCA]][%[[i]], %[[j]]] : (!emitc.array<4x8xf32>, !emitc.size_t, !emitc.size_t) -> !emitc.lvalue +// CHECK-SAME: %[[buff:.*]]: memref<4x8xf32>, %[[v:.*]]: f32, %[[argi:.*]]: index, %[[argj:.*]]: index +func.func @memref_store(%buff : memref<4x8xf32>, %v : f32, %i: index, %j: index) { + // CHECK-NEXT: %[[j:.*]] = builtin.unrealized_conversion_cast %[[argj]] : index to !emitc.size_t + // CHECK-NEXT: %[[i:.*]] = builtin.unrealized_conversion_cast %[[argi]] : index to !emitc.size_t + // CHECK-NEXT: %[[BUFFER:.*]] = builtin.unrealized_conversion_cast %[[buff]] : memref<4x8xf32> to !emitc.array<4x8xf32> + 
+ // CHECK-NEXT: %[[SUBSCRIPT:.*]] = emitc.subscript %[[BUFFER]][%[[i]], %[[j]]] : (!emitc.array<4x8xf32>, !emitc.size_t, !emitc.size_t) -> !emitc.lvalue // CHECK-NEXT: emitc.assign %[[v]] : f32 to %[[SUBSCRIPT]] : - memref.store %v, %0[%i, %j] : memref<4x8xf32> + memref.store %v, %buff[%i, %j] : memref<4x8xf32> return } // ----- // CHECK-LABEL: memref_load -// CHECK-SAME: %[[argi:.*]]: index, %[[argj:.*]]: index -func.func @memref_load(%i: index, %j: index) -> f32 { - // CHECK: %[[j:.*]] = builtin.unrealized_conversion_cast %[[argj]] : index to !emitc.size_t - // CHECK: %[[i:.*]] = builtin.unrealized_conversion_cast %[[argi]] : index to !emitc.size_t - // CHECK-NEXT: %[[ALLOCA:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.array<4x8xf32> - %0 = memref.alloca() : memref<4x8xf32> - - // CHECK-NEXT: %[[SUBSCRIPT:.*]] = emitc.subscript %[[ALLOCA]][%[[i]], %[[j]]] : (!emitc.array<4x8xf32>, !emitc.size_t, !emitc.size_t) -> !emitc.lvalue +// CHECK-SAME: %[[buff:.*]]: memref<4x8xf32>, %[[argi:.*]]: index, %[[argj:.*]]: index +func.func @memref_load(%buff : memref<4x8xf32>, %i: index, %j: index) -> f32 { + // CHECK-NEXT: %[[j:.*]] = builtin.unrealized_conversion_cast %[[argj]] : index to !emitc.size_t + // CHECK-NEXT: %[[i:.*]] = builtin.unrealized_conversion_cast %[[argi]] : index to !emitc.size_t + // CHECK-NEXT: %[[BUFFER:.*]] = builtin.unrealized_conversion_cast %[[buff]] : memref<4x8xf32> to !emitc.array<4x8xf32> + // CHECK-NEXT: %[[SUBSCRIPT:.*]] = emitc.subscript %[[BUFFER]][%[[i]], %[[j]]] : (!emitc.array<4x8xf32>, !emitc.size_t, !emitc.size_t) -> !emitc.lvalue // CHECK-NEXT: %[[LOAD:.*]] = emitc.load %[[SUBSCRIPT]] : - %1 = memref.load %0[%i, %j] : memref<4x8xf32> + %1 = memref.load %buff[%i, %j] : memref<4x8xf32> // CHECK-NEXT: return %[[LOAD]] : f32 return %1 : f32 } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir index b78577275a52a..ea1b79cbd9507 
100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir @@ -36,3 +36,12 @@ func.func @rfft2d_with_non_float_type(%arg0 : tensor<1x1x1xi32>) -> (tensor<1x1x %real, %imag = tosa.rfft2d %arg0 : (tensor<1x1x1xi32>) -> (tensor<1x1x1xi32>, tensor<1x1x1xi32>) return %real, %imag : tensor<1x1x1xi32>, tensor<1x1x1xi32> } + +// ----- + +// CHECK-LABEL: @rescale_unsupported_type +func.func @rescale_unsupported_type(%arg0: tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> { + // expected-error@+1 {{failed to legalize operation 'tosa.rescale'}} + %0 = tosa.rescale %arg0 {double_round = false, input_zp = 127 : i32, multiplier = array, output_zp = -1 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<13x21x3x!quant.uniform>) -> tensor<13x21x3x!quant.uniform> + return %0 : tensor<13x21x3x!quant.uniform> +} diff --git a/mlir/test/Dialect/AMX/legalize-for-llvm.mlir b/mlir/test/Dialect/AMX/legalize-for-llvm.mlir index 992203153939f..3cacbd0044f82 100644 --- a/mlir/test/Dialect/AMX/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/AMX/legalize-for-llvm.mlir @@ -43,3 +43,31 @@ func.func @mulf(%arg0: memref, %arg1: memref) { amx.tile_store %arg1[%0, %0], %4 : memref, vector<16x16xf32> return } + +// CHECK-LABEL: strides( +// CHECK: %[[CST_64_1:.+]] = llvm.mlir.constant(64 : i64) : i64 +// CHECK: "amx.tileloadd64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[CST_64_1]] +// CHECK: %[[CST_128_1:.+]] = llvm.mlir.constant(128 : i64) : i64 +// CHECK: "amx.tileloadd64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[CST_128_1]] +// CHECK: llvm.mlir.constant(2 : i64) : i64 +// CHECK: llvm.extractvalue %{{.+}}[4, 0] +// CHECK: %[[STRIDE_1:.+]] = llvm.mul +// CHECK: "amx.tileloadd64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[STRIDE_1]] +// CHECK: %[[CST_64_2:.+]] = llvm.mlir.constant(64 : i64) : i64 +// CHECK: "amx.tilestored64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[CST_64_2]] +// CHECK: %[[CST_128_2:.+]] = llvm.mlir.constant(128 : 
i64) : i64 +// CHECK: "amx.tilestored64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[CST_128_2]] +// CHECK: llvm.mlir.constant(2 : i64) : i64 +// CHECK: llvm.extractvalue %{{.+}}[4, 0] +// CHECK: %[[STRIDE_2:.+]] = llvm.mul +// CHECK: "amx.tilestored64"(%{{.+}}, %{{.+}}, %{{.+}}, %[[STRIDE_2]] +func.func @strides(%arg0: memref<16x32xbf16>, %arg1: memref<16x32xbf16, strided<[64, 1]>>, %arg2: memref<16x32xbf16, strided<[?, 1]>>) { + %0 = arith.constant 0 : index + %1 = amx.tile_load %arg0[%0, %0] : memref<16x32xbf16> into vector<16x32xbf16> + %2 = amx.tile_load %arg1[%0, %0] : memref<16x32xbf16, strided<[64, 1]>> into vector<16x32xbf16> + %3 = amx.tile_load %arg2[%0, %0] : memref<16x32xbf16, strided<[?, 1]>> into vector<16x32xbf16> + amx.tile_store %arg0[%0, %0], %3 : memref<16x32xbf16>, vector<16x32xbf16> + amx.tile_store %arg1[%0, %0], %1 : memref<16x32xbf16, strided<[64, 1]>>, vector<16x32xbf16> + amx.tile_store %arg2[%0, %0], %2 : memref<16x32xbf16, strided<[?, 1]>>, vector<16x32xbf16> + return +} diff --git a/mlir/test/Dialect/DLTI/invalid.mlir b/mlir/test/Dialect/DLTI/invalid.mlir index 4b04f0195ef82..2436e4f7484f5 100644 --- a/mlir/test/Dialect/DLTI/invalid.mlir +++ b/mlir/test/Dialect/DLTI/invalid.mlir @@ -25,7 +25,12 @@ // ----- -// expected-error@below {{repeated layout entry key: test.id}} +// expected-error@below {{empty string as DLTI key is not allowed}} +"test.unknown_op"() { test.unknown_attr = #dlti.map<"" = 42> } : () -> () + +// ----- + +// expected-error@below {{repeated DLTI key: "test.id"}} "test.unknown_op"() { test.unknown_attr = #dlti.dl_spec< #dlti.dl_entry<"test.id", 42>, #dlti.dl_entry<"test.id", 43> @@ -33,7 +38,7 @@ // ----- -// expected-error@below {{repeated layout entry key: 'i32'}} +// expected-error@below {{repeated DLTI key: i32}} "test.unknown_op"() { test.unknown_attr = #dlti.map< #dlti.dl_entry, #dlti.dl_entry @@ -41,7 +46,7 @@ // ----- -// expected-error@below {{repeated layout entry key: 'i32'}} +// expected-error@below {{repeated DLTI key: 
i32}} "test.unknown_op"() { test.unknown_attr = #dlti.dl_spec< #dlti.dl_entry, #dlti.dl_entry @@ -111,9 +116,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown // ----- -// expected-error@below {{expected string}} -// expected-error@below {{DeviceID is missing, or is not of string type}} -// expected-error@below {{failed to parse DLTI_TargetSystemSpecAttr parameter 'entries' which is to be a `::llvm::ArrayRef`}} +// expected-error@below {{invalid kind of attribute specified}} "test.unknown_op"() { dlti.target_system_spec = #dlti.target_system_spec<[]> } : () -> () // ----- @@ -121,11 +124,9 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown module attributes { // Device ID is missing // - // expected-error@+4 {{expected string}} - // expected-error@+3 {{DeviceID is missing, or is not of string type}} - // expected-error@+2 {{failed to parse DLTI_TargetSystemSpecAttr parameter 'entries' which is to be a `::llvm::ArrayRef`}} + // expected-error@below {{expected attribute value}} dlti.target_system_spec = #dlti.target_system_spec< - : #dlti.target_device_spec< + = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>> >} {} @@ -134,11 +135,9 @@ module attributes { module attributes { // Device ID is wrong type // - // expected-error@+4 {{expected string}} - // expected-error@+3 {{DeviceID is missing, or is not of string type}} - // expected-error@+2 {{failed to parse DLTI_TargetSystemSpecAttr parameter 'entries' which is to be a `::llvm::ArrayRef`}} + // expected-error@+2 {{invalid kind of attribute specified}} dlti.target_system_spec = #dlti.target_system_spec< - 0: #dlti.target_device_spec< + 0 = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>> >} {} @@ -147,11 +146,11 @@ module attributes { module attributes { // Repeated Device ID // - // expected-error@below {{repeated Device ID in dlti.target_system_spec: "CPU"}} + // expected-error@+1 
{{repeated device ID in dlti.target_system_spec: "CPU}} dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096>>, - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 8192>> >} {} @@ -160,11 +159,8 @@ module attributes { module attributes { // Repeated DLTI entry // - // expected-error@+4 {{repeated layout entry key: L1_cache_size_in_bytes}} - // expected-error@+6 {{Error in parsing target device spec}} - // expected-error@+5 {{failed to parse DLTI_TargetSystemSpecAttr parameter 'entries' which is to be a `::llvm::ArrayRef`}} + // expected-error@+2 {{repeated DLTI key: "L1_cache_size_in_bytes"}} dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 4096>, - #dlti.dl_entry<"L1_cache_size_in_bytes", 8192>> + "CPU" = #dlti.target_device_spec<"L1_cache_size_in_bytes" = 4096, + "L1_cache_size_in_bytes" = 8192> >} {} diff --git a/mlir/test/Dialect/DLTI/query.mlir b/mlir/test/Dialect/DLTI/query.mlir index a793c1a6e8e6a..3825cee6f1616 100644 --- a/mlir/test/Dialect/DLTI/query.mlir +++ b/mlir/test/Dialect/DLTI/query.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt -transform-interpreter -canonicalize -split-input-file -verify-diagnostics %s | FileCheck %s -// expected-remark @below {{associated attr 42 : i32}} -module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"test.id", 42 : i32>>} { +// expected-remark @below {{attr associated to "test.id" = 42 : i32}} +module attributes { test.dlti = #dlti.map<"test.id" = 42 : i32> } { func.func private @f() } @@ -10,7 +10,7 @@ module attributes {transform.with_named_sequence} { %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op %param = transform.dlti.query 
["test.id"] at %module : (!transform.any_op) -> !transform.any_param - transform.debug.emit_param_as_remark %param, "associated attr" at %module : !transform.any_param, !transform.any_op + transform.debug.emit_param_as_remark %param, "attr associated to \"test.id\" =" at %module : !transform.any_param, !transform.any_op transform.yield } } @@ -18,7 +18,7 @@ module attributes {transform.with_named_sequence} { // ----- // expected-remark @below {{i32 present in set : unit}} -module attributes { test.dlti = #dlti.map<#dlti.dl_entry>} { +module attributes { test.dlti = #dlti.map } { func.func private @f() } @@ -34,8 +34,8 @@ module attributes {transform.with_named_sequence} { // ----- -// expected-remark @below {{associated attr 32 : i32}} -module attributes { test.dlti = #dlti.map<#dlti.dl_entry>>>} { +// expected-remark @below {{attr associated to i32's "width_in_bits" = 32 : i32}} +module attributes { test.dlti = #dlti.map> } { func.func private @f() } @@ -44,7 +44,7 @@ module attributes {transform.with_named_sequence} { %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op %param = transform.dlti.query [i32,"width_in_bits"] at %module : (!transform.any_op) -> !transform.any_param - transform.debug.emit_param_as_remark %param, "associated attr" at %module : !transform.any_param, !transform.any_op + transform.debug.emit_param_as_remark %param, "attr associated to i32's \"width_in_bits\" =" at %module : !transform.any_param, !transform.any_op transform.yield } } @@ -53,7 +53,7 @@ module attributes {transform.with_named_sequence} { // expected-remark @below {{width in bits of i32 = 32 : i64}} // expected-remark @below {{width in bits of f64 = 64 : i64}} -module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"width_in_bits", #dlti.map<#dlti.dl_entry, #dlti.dl_entry>>>} { +module attributes { test.dlti = #dlti.map<"width_in_bits" = 
#dlti.map> } { func.func private @f() } @@ -71,8 +71,8 @@ module attributes {transform.with_named_sequence} { // ----- -// expected-remark @below {{associated attr 42 : i32}} -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { +// expected-remark @below {{attr associated to "test.id" = 42 : i32}} +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { func.func private @f() } @@ -81,32 +81,32 @@ module attributes {transform.with_named_sequence} { %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op %param = transform.dlti.query ["test.id"] at %module : (!transform.any_op) -> !transform.any_param - transform.debug.emit_param_as_remark %param, "associated attr" at %module : !transform.any_param, !transform.any_op + transform.debug.emit_param_as_remark %param, "attr associated to \"test.id\" =" at %module : !transform.any_param, !transform.any_op transform.yield } } // ----- -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { - // expected-remark @below {{associated attr 24 : i32}} - func.func private @f() attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 24 : i32>>} +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { + // expected-remark @below {{attr associated to "test.id" = 24 : i32}} + func.func private @f() attributes { test.dlti = #dlti.dl_spec<"test.id" = 24 : i32>} } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg: !transform.any_op) { %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op %param = transform.dlti.query ["test.id"] at %funcs : (!transform.any_op) -> !transform.any_param - transform.debug.emit_param_as_remark %param, "associated attr" at %funcs : !transform.any_param, !transform.any_op + 
transform.debug.emit_param_as_remark %param, "attr associated to \"test.id\" =" at %funcs : !transform.any_param, !transform.any_op transform.yield } } // ----- -// expected-remark @below {{associated attr 42 : i32}} -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { - func.func private @f() attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 24 : i32>>} +// expected-remark @below {{attr associated to "test.id" = 42 : i32}} +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { + func.func private @f() attributes { test.dlti = #dlti.dl_spec<"test.id" = 24 : i32> } } module attributes {transform.with_named_sequence} { @@ -114,14 +114,14 @@ module attributes {transform.with_named_sequence} { %funcs = transform.structured.match ops{["func.func"]} in %arg : (!transform.any_op) -> !transform.any_op %module = transform.get_parent_op %funcs : (!transform.any_op) -> !transform.any_op %param = transform.dlti.query ["test.id"] at %module : (!transform.any_op) -> !transform.any_param - transform.debug.emit_param_as_remark %param, "associated attr" at %module : !transform.any_param, !transform.any_op + transform.debug.emit_param_as_remark %param, "attr associated to \"test.id\" =" at %module : !transform.any_param, !transform.any_op transform.yield } } // ----- -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { func.func @matmul_tensors( %arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { @@ -144,10 +144,10 @@ module attributes {transform.with_named_sequence} { // ----- -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { func.func @matmul_tensors( %arg0: tensor, %arg1: tensor, %arg2: tensor) - -> tensor attributes {test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 24 : i32>>} { + -> tensor 
attributes {test.dlti = #dlti.dl_spec<"test.id" = 24 : i32> } { // expected-remark @below {{associated attr 24 : i32}} %0 = linalg.matmul ins(%arg0, %arg1: tensor, tensor) outs(%arg2: tensor) @@ -169,8 +169,8 @@ module attributes {transform.with_named_sequence} { // expected-remark @below {{associated attr 42 : i32}} module attributes { test.dlti = - #dlti.target_system_spec<"CPU": - #dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>>} { + #dlti.target_system_spec<"CPU" = + #dlti.target_device_spec<"test.id" = 42 : i32>> } { func.func private @f() } @@ -186,8 +186,8 @@ module attributes {transform.with_named_sequence} { // ----- -module attributes { test.dlti = #dlti.target_system_spec<"CPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>, - "GPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 43 : i32>>>} { +module attributes { test.dlti = #dlti.target_system_spec<"CPU" = #dlti.target_device_spec<"test.id" = 42 : i32>, + "GPU" = #dlti.target_device_spec<"test.id" = 43 : i32>> } { // expected-remark @below {{associated attr 43 : i32}} func.func private @f() } @@ -203,10 +203,10 @@ module attributes {transform.with_named_sequence} { // ----- -module attributes { test.dlti = #dlti.target_system_spec<"CPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>, - "GPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 43 : i32>>>} { +module attributes { test.dlti = #dlti.target_system_spec<"CPU" = #dlti.target_device_spec<"test.id" = 42 : i32>, + "GPU" = #dlti.target_device_spec<"test.id" = 43 : i32>> } { // expected-remark @below {{associated attr 24 : i32}} - func.func private @f() attributes { test.dlti = #dlti.target_system_spec<"CPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 24 : i32>>> } + func.func private @f() attributes { test.dlti = #dlti.target_system_spec<"CPU" = #dlti.target_device_spec<"test.id" = 24 : i32>> } } module attributes {transform.with_named_sequence} { @@ -221,9 +221,9 @@ module attributes 
{transform.with_named_sequence} { // ----- module attributes { test.dlti = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"cache::L1::size_in_bytes", 65536 : i32>, - #dlti.dl_entry<"cache::L1d::size_in_bytes", 32768 : i32>>> } { + "CPU" = #dlti.target_device_spec< + "cache::L1::size_in_bytes" = 65536 : i32, + "cache::L1d::size_in_bytes" = 32768 : i32>> } { // expected-remark @below {{L1::size_in_bytes 65536 : i32}} // expected-remark @below {{L1d::size_in_bytes 32768 : i32}} func.func private @f() @@ -242,13 +242,13 @@ module attributes {transform.with_named_sequence} { // ----- -#l1_size = #dlti.map<#dlti.dl_entry<"size_in_bytes", 65536 : i32>> -#l1d_size = #dlti.map<#dlti.dl_entry<"size_in_bytes", 32768 : i32>> +#l1_size = #dlti.map<"size_in_bytes" = 65536 : i32> +#l1d_size = #dlti.map<"size_in_bytes" = 32768 : i32> module attributes { test.dlti = - #dlti.target_system_spec<"CPU": - #dlti.target_device_spec<#dlti.dl_entry<"cache", - #dlti.map<#dlti.dl_entry<"L1", #l1_size>, - #dlti.dl_entry<"L1d", #l1d_size> >>>> } { + #dlti.target_system_spec<"CPU" = + #dlti.target_device_spec<"cache" = + #dlti.map<"L1" = #l1_size, + "L1d" = #l1d_size >>> } { // expected-remark @below {{L1::size_in_bytes 65536 : i32}} // expected-remark @below {{L1d::size_in_bytes 32768 : i32}} func.func private @f() @@ -268,8 +268,7 @@ module attributes {transform.with_named_sequence} { // ----- module attributes { test.dlti = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"inner_most_tile_size", 42 : i32>>>} { + "CPU" = #dlti.target_device_spec<"inner_most_tile_size" = 42 : i32>> } { // CHECK-LABEL: func @matmul_tensors func.func @matmul_tensors( %arg0: tensor, %arg1: tensor, %arg2: tensor) @@ -301,8 +300,8 @@ module attributes {transform.with_named_sequence} { // expected-note @below {{key "NPU" has no DLTI-mapping per attr: #dlti.target_system_spec}} module attributes { test.dlti = #dlti.target_system_spec< - "CPU": 
#dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>, - "GPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 43 : i32>>>} { + "CPU" = #dlti.target_device_spec<"test.id" = 42 : i32>, + "GPU" = #dlti.target_device_spec<"test.id" = 43 : i32>> } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -320,8 +319,8 @@ module attributes {transform.with_named_sequence} { // expected-note @below {{key "unspecified" has no DLTI-mapping per attr: #dlti.target_device_spec}} module attributes { test.dlti = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>, - "GPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 43 : i32>>>} { + "CPU" = #dlti.target_device_spec<"test.id" = 42 : i32>, + "GPU" = #dlti.target_device_spec<"test.id" = 43 : i32>> } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -339,8 +338,8 @@ module attributes {transform.with_named_sequence} { // expected-note @below {{key "test.id" has no DLTI-mapping per attr: #dlti.target_system_spec}} module attributes { test.dlti = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 42 : i32>>, - "GPU": #dlti.target_device_spec<#dlti.dl_entry<"test.id", 43 : i32>>>} { + "CPU" = #dlti.target_device_spec<"test.id" = 42 : i32>, + "GPU" = #dlti.target_device_spec<"test.id" = 43 : i32>> } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -357,7 +356,7 @@ module attributes {transform.with_named_sequence} { // ----- // expected-note @below {{key "CPU" has no DLTI-mapping per attr: #dlti.dl_spec}} -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32> } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -374,7 +373,7 @@ module attributes {transform.with_named_sequence} { // ----- // 
expected-note @below {{got non-DLTI-queryable attribute upon looking up keys ["CPU"]}} -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"CPU", 42 : i32>>} { +module attributes { test.dlti = #dlti.dl_spec<"CPU" = 42 : i32> } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -391,7 +390,7 @@ module attributes {transform.with_named_sequence} { // ----- // expected-note @below {{got non-DLTI-queryable attribute upon looking up keys [i32]}} -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry>} { +module attributes { test.dlti = #dlti.dl_spec } { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -424,8 +423,8 @@ module attributes {transform.with_named_sequence} { // ----- -// expected-note @below {{key i64 has no DLTI-mapping per attr: #dlti.map<#dlti.dl_entry>}} -module attributes { test.dlti = #dlti.map<#dlti.dl_entry<"width_in_bits", #dlti.map<#dlti.dl_entry>>>} { +// expected-note @below {{key i64 has no DLTI-mapping per attr: #dlti.map}} +module attributes { test.dlti = #dlti.map<"width_in_bits" = #dlti.map>} { // expected-error @below {{target op of failed DLTI query}} func.func private @f() } @@ -441,7 +440,7 @@ module attributes {transform.with_named_sequence} { // ----- -module attributes { test.dlti = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>} { +module attributes { test.dlti = #dlti.dl_spec<"test.id" = 42 : i32>} { func.func private @f() } diff --git a/mlir/test/Dialect/DLTI/roundtrip.mlir b/mlir/test/Dialect/DLTI/roundtrip.mlir index 43188aad595a7..361c8bee78e1e 100644 --- a/mlir/test/Dialect/DLTI/roundtrip.mlir +++ b/mlir/test/Dialect/DLTI/roundtrip.mlir @@ -15,13 +15,21 @@ test.unknown_attr_4 = #dlti.dl_entry, ["string", 10]>, // CHECK: #dlti.dl_spec<> test.unknown_attr_5 = #dlti.dl_spec<>, - // CHECK: #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>> - test.unknown_attr_6 = #dlti.dl_spec<#dlti.dl_entry<"test.id", 42 : i32>>, + // CHECK: 
#dlti.dl_spec<"test.id" = 42 : i32> + test.unknown_attr_6 = #dlti.dl_spec<"test.id" = 42 : i32>, // CHECK: #dlti.dl_spec< - // CHECK: #dlti.dl_entry<"test.id1", 43 : index> - // CHECK: #dlti.dl_entry<"test.id2", 44 : index> - // CHECK: #dlti.dl_entry<"test.id3", 45 : index>> + // CHECK: "test.id1" = 43 : index, + // CHECK: "test.id2" = 44 : index, + // CHECK: "test.id3" = 45 : index> test.unknown_attr_7 = #dlti.dl_spec< + "test.id1" = 43 : index, + "test.id2" = 44 : index, + "test.id3" = 45 : index>, + // CHECK: #dlti.dl_spec< + // CHECK: "test.id1" = 43 : index, + // CHECK: "test.id2" = 44 : index, + // CHECK: "test.id3" = 45 : index> + test.unknown_attr_7_unsugared = #dlti.dl_spec< #dlti.dl_entry<"test.id1", 43 : index>, #dlti.dl_entry<"test.id2", 44 : index>, #dlti.dl_entry<"test.id3", 45 : index>> @@ -40,34 +48,34 @@ // Should not fail on nested compatible layouts. "test.op_with_data_layout"() ({ - "test.op_with_data_layout"() { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown", 32>> } : () -> () + "test.op_with_data_layout"() { dlti.dl_spec = #dlti.dl_spec<"unknown.unknown" = 32> } : () -> () "test.maybe_terminator_op"() : () -> () -}) { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown", 32>> } : () -> () +}) { dlti.dl_spec = #dlti.dl_spec<"unknown.unknown" = 32> } : () -> () // Should not fail on deeper nested compatible layouts. 
"test.op_with_data_layout"() ({ "test.op_with_data_layout"() ({ "test.op_with_data_layout"() - { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown", 32>> } : () -> () + { dlti.dl_spec = #dlti.dl_spec<"unknown.unknown" = 32> } : () -> () "test.maybe_terminator_op"() : () -> () - }) { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown", 32>> } : () -> () + }) { dlti.dl_spec = #dlti.dl_spec<"unknown.unknown" = 32> } : () -> () "test.maybe_terminator_op"() : () -> () -}) { dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"unknown.unknown", 32>> } : () -> () +}) { dlti.dl_spec = #dlti.dl_spec<"unknown.unknown" = 32> } : () -> () // A valid target system description // CHECK: module attributes { -// CHECK: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK: "CPU" : #dlti.target_device_spec< -// CHECK: #dlti.dl_entry<"dlti.L1_cache_size_in_bytes", 4096 : ui32>>, -// CHECK: "GPU" : #dlti.target_device_spec< -// CHECK: #dlti.dl_entry<"dlti.max_vector_op_width", 128 : ui32>> -// CHECK: >} { +// CHECK: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK: "CPU" = #dlti.target_device_spec< +// CHECK: "dlti.L1_cache_size_in_bytes" = 4096 : ui32>, +// CHECK: "GPU" = #dlti.target_device_spec< +// CHECK: "dlti.max_vector_op_width" = 128 : ui32> +// CHECK: >} { // CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"dlti.L1_cache_size_in_bytes", 4096 : ui32>>, - "GPU": #dlti.target_device_spec< - #dlti.dl_entry<"dlti.max_vector_op_width", 128 : ui32>> + "CPU" = #dlti.target_device_spec< + "dlti.L1_cache_size_in_bytes" = 4096 : ui32>, + "GPU" = #dlti.target_device_spec< + "dlti.max_vector_op_width" = 128 : ui32> >} {} - + diff --git a/mlir/test/Dialect/DLTI/valid.mlir b/mlir/test/Dialect/DLTI/valid.mlir index 31c925e5cb5be..4c24e80041003 100644 --- a/mlir/test/Dialect/DLTI/valid.mlir +++ b/mlir/test/Dialect/DLTI/valid.mlir @@ -3,12 +3,28 @@ // CHECK: module attributes 
{ // CHECK-SAME: dlti.map = #dlti.map< -// CHECK-SAME: #dlti.dl_entry<"magic_num", 42 : i32>, -// CHECK-SAME: #dlti.dl_entry<"magic_num_float", 4.242000e+01 : f32>, -// CHECK-SAME: #dlti.dl_entry<"magic_type", i32>, -// CHECK-SAME: #dlti.dl_entry>> -// CHECK-SAME: >} { +// CHECK-SAME: "magic_num" = 42 : i32, +// CHECK-SAME: "magic_num_float" = 4.242000e+01 : f32, +// CHECK-SAME: "magic_type" = i32, +// CHECK-SAME: i32 = #dlti.map<"bitwidth" = 32 : i32> +// CHECK: >} { +// CHECK: } +module attributes { + dlti.map = #dlti.map<"magic_num" = 42 : i32, + "magic_num_float" = 42.42 : f32, + "magic_type" = i32, + i32 = #dlti.map<"bitwidth" = 32 : i32>> + } {} + +// ----- + +// CHECK: module attributes { +// CHECK-SAME: dlti.map = #dlti.map< +// CHECK-SAME: "magic_num" = 42 : i32, +// CHECK-SAME: "magic_num_float" = 4.242000e+01 : f32, +// CHECK-SAME: "magic_type" = i32, +// CHECK-SAME: i32 = #dlti.map<"bitwidth" = 32 : i32> +// CHECK: >} { // CHECK: } module attributes { dlti.map = #dlti.map< @@ -21,13 +37,11 @@ module attributes { // ----- // CHECK: module attributes { -// CHECK-SAME: dlti.map = #dlti.map< -// CHECK-SAME: #dlti.dl_entry<"CPU", #dlti.map< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>>>, -// CHECK-SAME: #dlti.dl_entry<"GPU", #dlti.map< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 128 : i32>>> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.map = #dlti.map< +// CHECK-SAME: "CPU" = #dlti.map<"L1_cache_size_in_bytes" = 4096 : i32>, +// CHECK-SAME: "GPU" = #dlti.map<"max_vector_op_width" = 128 : i32> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.map = #dlti.map< #dlti.dl_entry<"CPU", #dlti.map< @@ -40,17 +54,17 @@ module attributes { // CHECK: module attributes { // CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// 
CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 128 : i32>> +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 4096 : i32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 128 : i32> // CHECK-SAME: >} { -// CHECK: } +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 128 : i32>> >} {} @@ -58,89 +72,89 @@ module attributes { // CHECK: module attributes { // CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 8192 : i32>> +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 4096 : i32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 8192 : i32> // CHECK-SAME: >} { -// CHECK: } +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i32>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 8192 : i32>> >} {} // ----- // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i64>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 8192 : i64>> -// CHECK-SAME: >} { -// CHECK: } +// 
CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 4096 : i64>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 8192 : i64> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : i64>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 8192 : i64>> >} {} // ----- // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 64 : i32>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 128 : i32>> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 64 : i32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 128 : i32> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 64 : i32>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 128 : i32>> >} {} // ----- // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 64 : i64>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 128 : i64>> -// CHECK-SAME: >} { -// 
CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 64 : i64>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 128 : i64> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 64 : i64>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 128 : i64>> >} {} // ----- // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 64 : i64>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 128 : i64>> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 64 : i64>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 128 : i64> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 64 : i64>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 128 : i64>> >} {} @@ -149,18 +163,18 @@ module attributes { // Check values of mixed type // // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : ui32>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: 
#dlti.dl_entry<"max_vector_op_width", "128">> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 4096 : ui32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = "128"> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", 4096 : ui32>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", "128">> >} {} @@ -169,18 +183,18 @@ module attributes { // Check values of mixed type // // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"max_vector_op_width", 4.096000e+03 : f32>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", "128">> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 4.096000e+03 : f32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = "128"> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"max_vector_op_width", 4096.0 : f32>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", "128">> >} {} @@ -190,34 +204,51 @@ module attributes { // Check values of mixed type // // CHECK: module attributes { -// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< -// CHECK-SAME: "CPU" : 
#dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"vector_unit", #dlti.map< -// CHECK-SAME: #dlti.dl_entry<"max_op_width", 4.096000e+03 : f32>>>>, -// CHECK-SAME: "GPU" : #dlti.target_device_spec< -// CHECK-SAME: #dlti.dl_entry<"L1_cache_size_in_bytes", "128">> -// CHECK-SAME: >} { -// CHECK: } +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "vector_unit" = #dlti.map< +// CHECK-SAME: "max_op_width" = 4.096000e+03 : f32>>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = "128"> +// CHECK-SAME: >} { +// CHECK: } module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< + "CPU" = #dlti.target_device_spec< #dlti.dl_entry<"vector_unit", #dlti.map< #dlti.dl_entry<"max_op_width", 4096.0 : f32>>>>, - "GPU": #dlti.target_device_spec< + "GPU" = #dlti.target_device_spec< #dlti.dl_entry<"L1_cache_size_in_bytes", "128">> >} {} +// ----- + +// Check values of mixed type +// +// CHECK: module attributes { +// CHECK-SAME: dlti.target_system_spec = #dlti.target_system_spec< +// CHECK-SAME: "CPU" = #dlti.target_device_spec< +// CHECK-SAME: "L1_cache_size_in_bytes" = 4096 : ui32>, +// CHECK-SAME: "GPU" = #dlti.target_device_spec< +// CHECK-SAME: "max_vector_op_width" = 128 : i64> +// CHECK-SAME: >} { +// CHECK: } +module attributes { + dlti.target_system_spec = #dlti.target_system_spec< + "CPU" = #dlti.target_device_spec<"L1_cache_size_in_bytes" = 4096 : ui32>, + "GPU" = #dlti.target_device_spec<"max_vector_op_width" = 128> + >} {} // ----- // CHECK: "test.op_with_dlti_map"() ({ -// CHECK: }) {dlti.map = #dlti.map<#dlti.dl_entry<"dlti.unknown_id", 42 : i64>>} +// CHECK: }) {dlti.map = #dlti.map<"dlti.unknown_id" = 42 : i64>} "test.op_with_dlti_map"() ({ }) { dlti.map = #dlti.map<#dlti.dl_entry<"dlti.unknown_id", 42>> } : () -> () // ----- // CHECK: "test.op_with_dlti_map"() ({ -// CHECK: }) {dlti.map 
= #dlti.map<#dlti.dl_entry>} +// CHECK: }) {dlti.map = #dlti.map} "test.op_with_dlti_map"() ({ }) { dlti.map = #dlti.map<#dlti.dl_entry> } : () -> () diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 7f44f11b47e06..6e682b26f6c95 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -36,7 +36,7 @@ func.func @launch() { return } -// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK-LABEL: gpu.module @launch_kernel // CHECK-NEXT: gpu.func @launch_kernel // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref) @@ -123,7 +123,7 @@ llvm.func @launch_from_llvm_func() { llvm.return } -// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // ----- @@ -169,8 +169,8 @@ func.func @multiple_launches() { return } -// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} -// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec} +// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK: gpu.module @multiple_launches_kernel // CHECK: func @multiple_launches_kernel @@ -197,7 +197,7 @@ func.func @extra_constants_not_inlined(%arg0: memref) { return } -// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK-LABEL: func 
@extra_constants_not_inlined_kernel(%{{.*}}: memref, %{{.*}}: index) // CHECK: arith.constant 2 @@ -223,7 +223,7 @@ func.func @extra_constants(%arg0: memref) { return } -// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK-LABEL: func @extra_constants_kernel( // CHECK-SAME: %[[KARG0:.*]]: memref @@ -253,7 +253,7 @@ func.func @extra_constants_noarg(%arg0: memref, %arg1: memref) { return } -// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK-LABEL: func @extra_constants_noarg_kernel( // CHECK-SAME: %[[KARG0:.*]]: memref, %[[KARG1:.*]]: index @@ -283,7 +283,7 @@ func.func @multiple_uses(%arg0 : memref) { return } -// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // ----- @@ -312,7 +312,7 @@ func.func @multiple_uses2(%arg0 : memref<*xf32>) { return } -// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // ----- @@ -343,7 +343,7 @@ func.func @recursive_device_function() { return } -// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK: gpu.module @function_call_kernel { // CHECK: gpu.func @function_call_kernel() @@ -373,7 +373,7 @@ func.func @non_constant_launches(%arg0 : index) { return } -// CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes 
{dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK: module attributes {gpu.container_module} @@ -401,7 +401,7 @@ func.func @launch_memory_attributions_0() { return } -// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel // CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel @@ -435,7 +435,7 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) { return } -// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>} +// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec} // ----- // CHECK: module attributes {gpu.container_module} diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir index dafb3bcef740f..1147cb110199d 100644 --- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir @@ -80,9 +80,17 @@ flags = "TypePassByReference|NonTrivial" > -// CHECK-DAG: #[[SPTYPE0:.*]] = #llvm.di_subroutine_type +// CHECK-DAG: #[[COMP3:.+]] = #llvm.di_composite_type<{{.*}}, name = "expr_elements2"{{.*}}elements = #llvm.di_generic_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), DW_OP_deref]>>> +#exp1 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(16), DW_OP_deref]> +#exp2 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]> +#exp3 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), DW_OP_deref]> +#comp3 = 
#llvm.di_composite_type> + +// CHECK-DAG: #[[SPTYPE0:.*]] = #llvm.di_subroutine_type #spType0 = #llvm.di_subroutine_type< - callingConvention = DW_CC_normal, types = #null, #int0, #ptr0, #ptr1, #ptr2, #comp0, #comp1, #comp2 + callingConvention = DW_CC_normal, types = #null, #int0, #ptr0, #ptr1, #ptr2, #comp0, #comp1, #comp2, #comp3 > // CHECK-DAG: #[[SPTYPE1:.*]] = #llvm.di_subroutine_type diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 4a18aa6faf1d8..6f4ed802068aa 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -666,3 +666,13 @@ func.func @test_mul_invalid_shift(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1 %0 = tosa.mul %arg0, %arg1 {shift = 1 : i8} : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } + +// ----- + +// CHECK-LABEL: test_unsupported_int64_data_type +func.func @test_unsupported_int64_data_type(%arg0: tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64> { + // expected-error@+1 {{'tosa.argmax' op is not profile-aligned: element type 'i64' is not legal}} + %0 = tosa.argmax %arg0 {axis = 3 : i32} : (tensor<1x13x13x5xf32>) -> tensor<1x13x13xi64> + // expected-error@+1 {{'func.return' op is not profile-aligned: element type 'i64' is not legal}} + return %0 : tensor<1x13x13xi64> +} diff --git a/mlir/test/IR/enum-attr-roundtrip.mlir b/mlir/test/IR/enum-attr-roundtrip.mlir index 0b4d379cfb7d5..36e605bdbff4d 100644 --- a/mlir/test/IR/enum-attr-roundtrip.mlir +++ b/mlir/test/IR/enum-attr-roundtrip.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s | mlir-opt -test-patterns | FileCheck %s +// RUN: mlir-opt %s | mlir-opt -test-greedy-patterns | FileCheck %s // CHECK-LABEL: @test_enum_attr_roundtrip func.func @test_enum_attr_roundtrip() -> () { diff --git a/mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir b/mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir index f3da9a147fcb9..d619eefd72102 100644 --- 
a/mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir +++ b/mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-patterns="max-iterations=1" \ +// RUN: mlir-opt %s -test-greedy-patterns="max-iterations=1" \ // RUN: -allow-unregistered-dialect --split-input-file | FileCheck %s // CHECK-LABEL: func @add_to_worklist_after_inplace_update() diff --git a/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir b/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir index a362d6f99b947..9f4a7924b725a 100644 --- a/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir +++ b/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-patterns="max-iterations=1 top-down=true" \ +// RUN: mlir-opt %s -test-greedy-patterns="max-iterations=1 top-down=true" \ // RUN: --split-input-file | FileCheck %s // Tests for https://github.com/llvm/llvm-project/issues/86765. Ensure diff --git a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir new file mode 100644 index 0000000000000..02f7e60671c9b --- /dev/null +++ b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir @@ -0,0 +1,121 @@ +// RUN: mlir-opt %s --test-walk-pattern-rewrite-driver="dump-notifications=true" \ +// RUN: --allow-unregistered-dialect --split-input-file | FileCheck %s + +// The following op is updated in-place and will not be added back to the worklist. +// CHECK-LABEL: func.func @inplace_update() +// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: "test.any_attr_of_i32_str"() <{attr = 2 : i32}> : () -> () +func.func @inplace_update() { + "test.any_attr_of_i32_str"() {attr = 0 : i32} : () -> () + "test.any_attr_of_i32_str"() {attr = 1 : i32} : () -> () + return +} + +// Check that the driver does not fold visited ops. 
+// CHECK-LABEL: func.func @add_no_fold() +// CHECK: arith.constant +// CHECK: arith.constant +// CHECK: %[[RES:.+]] = arith.addi +// CHECK: return %[[RES]] +func.func @add_no_fold() -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %res = arith.addi %c0, %c1 : i32 + return %res : i32 +} + +// Check that the driver handles rewriter.moveBefore. +// CHECK-LABEL: func.func @move_before( +// CHECK: "test.move_before_parent_op" +// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: scf.if +// CHECK: return +func.func @move_before(%cond : i1) { + scf.if %cond { + "test.move_before_parent_op"() ({ + "test.any_attr_of_i32_str"() {attr = 0 : i32} : () -> () + }) : () -> () + } + return +} + +// Check that the driver handles rewriter.moveAfter. In this case, we expect +// the moved op to be visited only once since walk uses `make_early_inc_range`. +// CHECK-LABEL: func.func @move_after( +// CHECK: scf.if +// CHECK: } +// CHECK: "test.move_after_parent_op" +// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: return +func.func @move_after(%cond : i1) { + scf.if %cond { + "test.move_after_parent_op"() ({ + "test.any_attr_of_i32_str"() {attr = 0 : i32} : () -> () + }) : () -> () + } + return +} + +// Check that the driver handles rewriter.moveAfter. In this case, we expect +// the moved op to be visited twice since we advance its position to the next +// node after the parent. 
+// CHECK-LABEL: func.func @move_forward_and_revisit( +// CHECK: scf.if +// CHECK: } +// CHECK: arith.addi +// CHECK: "test.move_after_parent_op" +// CHECK: "test.any_attr_of_i32_str"() <{attr = 2 : i32}> : () -> () +// CHECK: arith.addi +// CHECK: return +func.func @move_forward_and_revisit(%cond : i1) { + scf.if %cond { + "test.move_after_parent_op"() ({ + "test.any_attr_of_i32_str"() {attr = 0 : i32} : () -> () + }) {advance = 1 : i32} : () -> () + } + %a = arith.addi %cond, %cond : i1 + %b = arith.addi %a, %cond : i1 + return +} + +// Operation inserted just after the currently visited one won't be visited. +// CHECK-LABEL: func.func @insert_just_after +// CHECK: "test.clone_me"() ({ +// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: }) {was_cloned} : () -> () +// CHECK: "test.clone_me"() ({ +// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: }) : () -> () +// CHECK: return +func.func @insert_just_after(%cond : i1) { + "test.clone_me"() ({ + "test.any_attr_of_i32_str"() {attr = 0 : i32} : () -> () + }) : () -> () + return +} + +// Check that we can replace the current operation with a new one. +// Note that the new op won't be visited. +// CHECK-LABEL: func.func @replace_with_new_op +// CHECK: %[[NEW:.+]] = "test.new_op" +// CHECK: %[[RES:.+]] = arith.addi %[[NEW]], %[[NEW]] +// CHECK: return %[[RES]] +func.func @replace_with_new_op() -> i32 { + %a = "test.replace_with_new_op"() : () -> (i32) + %res = arith.addi %a, %a : i32 + return %res : i32 +} + +// Check that we can erase nested blocks. 
+// CHECK-LABEL: func.func @erase_nested_block +// CHECK: %[[RES:.+]] = "test.erase_first_block" +// CHECK-NEXT: foo.bar +// CHECK: return %[[RES]] +func.func @erase_nested_block() -> i32 { + %a = "test.erase_first_block"() ({ + "foo.foo"() : () -> () + ^bb1: + "foo.bar"() : () -> () + }): () -> (i32) + return %a : i32 +} diff --git a/mlir/test/Target/LLVMIR/Import/data-layout.ll b/mlir/test/Target/LLVMIR/Import/data-layout.ll index ee6f4dd994f1d..c397053585e3c 100644 --- a/mlir/test/Target/LLVMIR/Import/data-layout.ll +++ b/mlir/test/Target/LLVMIR/Import/data-layout.ll @@ -4,16 +4,16 @@ ; CHECK: dlti.dl_spec = ; CHECK: #dlti.dl_spec< -; CHECK-DAG: #dlti.dl_entry<"dlti.endianness", "little"> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<4xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> +; CHECK-DAG: "dlti.endianness" = "little" +; CHECK-DAG: i1 = dense<8> : vector<2xi64> +; CHECK-DAG: i8 = dense<8> : vector<2xi64> +; CHECK-DAG: i16 = dense<16> : vector<2xi64> +; CHECK-DAG: i32 = dense<32> : vector<2xi64> +; CHECK-DAG: i64 = dense<[32, 64]> : vector<2xi64> +; CHECK-DAG: !llvm.ptr = dense<64> : vector<4xi64> +; CHECK-DAG: f16 = dense<16> : vector<2xi64> +; CHECK-DAG: f64 = dense<64> : vector<2xi64> +; CHECK-DAG: f128 = dense<128> : vector<2xi64> ; CHECK: > target datalayout = "" @@ -21,30 +21,30 @@ target datalayout = "" ; CHECK: dlti.dl_spec = ; CHECK: #dlti.dl_spec< -; CHECK-DAG: #dlti.dl_entry<"dlti.endianness", "little"> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> -; CHECK-DAG: #dlti.dl_entry, dense<[32, 64, 64, 32]> : vector<4xi64>> -; CHECK-DAG: 
#dlti.dl_entry, dense<32> : vector<4xi64>> -; CHECK-DAG: #dlti.dl_entry, dense<64> : vector<4xi64>> -; CHECK-DAG: #dlti.dl_entry<"dlti.stack_alignment", 128 : i64> +; CHECK-DAG: "dlti.endianness" = "little" +; CHECK-DAG: i64 = dense<64> : vector<2xi64> +; CHECK-DAG: f80 = dense<128> : vector<2xi64> +; CHECK-DAG: i8 = dense<8> : vector<2xi64> +; CHECK-DAG: !llvm.ptr<270> = dense<[32, 64, 64, 32]> : vector<4xi64> +; CHECK-DAG: !llvm.ptr<271> = dense<32> : vector<4xi64> +; CHECK-DAG: !llvm.ptr<272> = dense<64> : vector<4xi64> +; CHECK-DAG: "dlti.stack_alignment" = 128 : i64 target datalayout = "e-m:e-p270:32:64-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" ; // ----- ; CHECK: dlti.dl_spec = ; CHECK: #dlti.dl_spec< -; CHECK-DAG: #dlti.dl_entry<"dlti.endianness", "big"> -; CHECK-DAG: #dlti.dl_entry, dense<[16, 32, 64, 8]> : vector<4xi64>> -; CHECK-DAG: #dlti.dl_entry, dense<[16, 32, 64, 16]> : vector<4xi64>> -; CHECK-DAG: #dlti.dl_entry<"dlti.alloca_memory_space", 1 : ui64> -; CHECK-DAG: #dlti.dl_entry : vector<2xi64>> +; CHECK-DAG: "dlti.endianness" = "big" +; CHECK-DAG: !llvm.ptr<270> = dense<[16, 32, 64, 8]> : vector<4xi64> +; CHECK-DAG: !llvm.ptr<271> = dense<[16, 32, 64, 16]> : vector<4xi64> +; CHECK-DAG: "dlti.alloca_memory_space" = 1 : ui64 +; CHECK-DAG: i64 = dense<[64, 128]> : vector<2xi64> target datalayout = "A1-E-p270:16:32:64:8-p271:16:32:64-i64:64:128" ; // ----- ; CHECK: dlti.dl_spec = ; CHECK: #dlti.dl_spec< -; CHECK-NOT: #dlti.dl_entry<"dlti.alloca_memory_space" +; CHECK-NOT: "dlti.alloca_memory_space" = target datalayout = "A0" diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index 9ef1942e0787c..9e2a17fb436af 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -169,7 +169,8 @@ define void @derived_type() !dbg !3 { ; CHECK-DAG: #[[COMP4:.+]] = #llvm.di_composite_type<{{.*}}, flags = Vector, elements = #llvm.di_subrange> ; 
CHECK-DAG: #[[COMP5:.+]] = #llvm.di_composite_type<{{.*}}, name = "var_elements"{{.*}}elements = #llvm.di_subrange> ; CHECK-DAG: #[[COMP6:.+]] = #llvm.di_composite_type<{{.*}}, name = "expr_elements"{{.*}}elements = #llvm.di_subrange>> -; CHECK-DAG: #llvm.di_subroutine_type +; CHECK-DAG: #[[COMP7:.+]] = #llvm.di_composite_type<{{.*}}, name = "expr_elements2"{{.*}}elements = #llvm.di_generic_subrange, lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, stride = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), DW_OP_deref]>>> +; CHECK-DAG: #llvm.di_subroutine_type @gv = external global i64 @@ -184,7 +185,7 @@ define void @composite_type() !dbg !3 { !2 = !DIFile(filename: "debug-info.ll", directory: "/") !3 = distinct !DISubprogram(name: "composite_type", scope: !2, file: !2, spFlags: DISPFlagDefinition, unit: !1, type: !4) !4 = !DISubroutineType(types: !5) -!5 = !{!7, !8, !9, !10, !18, !22} +!5 = !{!7, !8, !9, !10, !18, !22, !24} !6 = !DIBasicType(name: "int") !7 = !DICompositeType(tag: DW_TAG_array_type, name: "array1", line: 10, size: 128, align: 32, baseType: !6) !8 = !DICompositeType(tag: DW_TAG_array_type, name: "array2", file: !2, scope: !2, baseType: !6) @@ -203,6 +204,12 @@ define void @composite_type() !dbg !3 { !21 = !{!19} !22 = !DICompositeType(tag: DW_TAG_array_type, name: "expr_elements", flags: DIFlagVector, elements: !21, baseType: !6) !23 = !DIGlobalVariable(name: "gv", scope: !1, file: !2, line: 3, type: !6, isLocal: false, isDefinition: false) +!24 = !DICompositeType(tag: DW_TAG_array_type, name: "expr_elements2", elements: !29, baseType: !6) +!25 = !DIGenericSubrange(count: !26, lowerBound: !27, stride: !28) +!26 = !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 16, DW_OP_deref) +!27 = !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref) +!28 = !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref) +!29 = !{!25} ; // ----- 
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index b97bdcbc772d8..eac2c5090a5b5 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -591,28 +591,40 @@ llvm.func @fn_with_composite() { // ----- -// Test that Subrange works with expression and variables. +// Test that Subrange/generic_subrange works with expression and variables. #bt = #llvm.di_basic_type #file = #llvm.di_file<"debug-info.ll" in "/"> #cu = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #file, isOptimized = false, emissionKind = Full> +#exp1 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(16), + DW_OP_deref]> #comp_ty1 = #llvm.di_composite_type>> -#srty = #llvm.di_subroutine_type + elements = #llvm.di_subrange> +#exp2 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), + DW_OP_deref]> +#exp3 = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), + DW_OP_deref]> +#comp_ty2 = #llvm.di_composite_type> +#srty = #llvm.di_subroutine_type #sp = #llvm.di_subprogram #lvar = #llvm.di_local_variable #gv = #llvm.di_global_variable #gve = #llvm.di_global_variable_expression> -#comp_ty2 = #llvm.di_composite_type> -#lvar2 = #llvm.di_local_variable +#comp_ty4 = #llvm.di_composite_type> +#lvar2 = #llvm.di_local_variable +#lvar3 = #llvm.di_local_variable #loc1 = loc("test.f90": 1:1) #loc2 = loc(fused<#sp>[#loc1]) @@ -620,6 +632,7 @@ llvm.mlir.global external @gv() {dbg_exprs = [#gve]} : i64 llvm.func @subranges(%arg: !llvm.ptr) { llvm.intr.dbg.declare #lvar2 = %arg : !llvm.ptr + llvm.intr.dbg.declare #lvar3 = %arg : !llvm.ptr llvm.return } loc(#loc2) @@ -628,11 +641,19 @@ llvm.func @subranges(%arg: !llvm.ptr) { // CHECK: !DICompositeType(tag: DW_TAG_array_type, name: "expr_elements"{{.*}}elements: ![[ELEMENTS1:[0-9]+]]) // CHECK: ![[ELEMENTS1]] = !{![[ELEMENT1:[0-9]+]]} // CHECK: ![[ELEMENT1]] = !DISubrange(count: 
!DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 16, DW_OP_deref)) +// CHECK: !DICompositeType(tag: DW_TAG_array_type, name: "expr_elements2"{{.*}}elements: ![[GSR_ELEMS:[0-9]+]]) +// CHECK: ![[GSR_ELEMS]] = !{![[GSR_ELEM:[0-9]+]]} +// CHECK: ![[GSR_ELEM]] = !DIGenericSubrange(count: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 16, DW_OP_deref) +// CHECK-SAME: lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref) +// CHECK-SAME: stride: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref) // CHECK: !DICompositeType(tag: DW_TAG_array_type, name: "var_elements"{{.*}}elements: ![[ELEMENTS2:[0-9]+]]) // CHECK: ![[ELEMENTS2]] = !{![[ELEMENT2:[0-9]+]]} -// CHECK: ![[ELEMENT2]] = !DISubrange(count: ![[LV:[0-9]+]], stride: ![[GV:[0-9]+]]) +// CHECK: ![[ELEMENT2]] = !DISubrange(count: ![[LV:[0-9]+]], stride: ![[GV]]) // CHECK: ![[LV]] = !DILocalVariable(name: "size"{{.*}}) +// CHECK: !DICompositeType(tag: DW_TAG_array_type, name: "var_elements2", baseType: !{{.*}}, elements: ![[GSR_ELEMS2:[0-9]+]]) +// CHECK: ![[GSR_ELEMS2]] = !{![[GSR_ELEM2:[0-9]+]]} +// CHECK: ![[GSR_ELEM2]] = !DIGenericSubrange(count: ![[LV]], lowerBound: ![[GV]], stride: ![[GV]]) // ----- diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir new file mode 100644 index 0000000000000..3c9bd9031c3e8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -0,0 +1,684 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s + + +llvm.func @atomic_hint(%v : !llvm.ptr, %x : !llvm.ptr, %expr : i32) { + // expected-warning@below {{hint clause discarded}} + omp.atomic.capture hint(uncontended) { + omp.atomic.read %x = %v : !llvm.ptr, i32 + omp.atomic.write %v = %expr : !llvm.ptr, i32 + } + + // expected-warning@below {{hint clause discarded}} + omp.atomic.read %x = %v hint(contended) : !llvm.ptr, i32 + + // expected-warning@below {{hint clause 
discarded}} + omp.atomic.write %v = %expr hint(nonspeculative) : !llvm.ptr, i32 + + // expected-warning@below {{hint clause discarded}} + omp.atomic.update hint(speculative) %x : !llvm.ptr { + ^bb0(%arg0: i32): + %result = llvm.add %arg0, %expr : i32 + omp.yield(%result : i32) + } + + llvm.return +} + +// ----- + +llvm.func @cancel() { + // expected-error@below {{LLVM Translation failed for operation: omp.parallel}} + omp.parallel { + // expected-error@below {{unsupported OpenMP operation: omp.cancel}} + // expected-error@below {{LLVM Translation failed for operation: omp.cancel}} + omp.cancel cancellation_construct_type(parallel) + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @cancellation_point() { + // expected-error@below {{LLVM Translation failed for operation: omp.parallel}} + omp.parallel { + // expected-error@below {{unsupported OpenMP operation: omp.cancellation_point}} + // expected-error@below {{LLVM Translation failed for operation: omp.cancellation_point}} + omp.cancellation_point cancellation_construct_type(parallel) + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @do_simd(%lb : i32, %ub : i32, %step : i32) { + omp.wsloop { + // expected-warning@below {{simd information on composite construct discarded}} + omp.simd { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } {omp.composite} + } {omp.composite} + llvm.return +} + +// ----- + +llvm.func @distribute(%lb : i32, %ub : i32, %step : i32) { + // expected-error@below {{unsupported OpenMP operation: omp.distribute}} + // expected-error@below {{LLVM Translation failed for operation: omp.distribute}} + omp.distribute { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @ordered_region_par_level_simd() { + // expected-error@below {{parallelization-level clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.ordered.region}} + 
omp.ordered.region par_level_simd { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @parallel_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.parallel}} + omp.parallel allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @sections_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.sections}} + omp.sections allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @sections_private(%x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.sections}} + omp.sections private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @simd_aligned(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{aligned clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.simd}} + omp.simd aligned(%x : !llvm.ptr -> 32) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @simd_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{linear clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.simd}} + omp.simd linear(%x = %step : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func 
@simd_nontemporal(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{nontemporal clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.simd}} + omp.simd nontemporal(%x : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @simd_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.simd}} + omp.simd private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func @simd_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{reduction clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.simd}} + omp.simd reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @single_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.single}} + omp.single allocate(%x : !llvm.ptr 
-> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @single_private(%x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.single}} + omp.single private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_device(%x : i32) { + // expected-error@below {{device clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target device(%x : i32) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_has_device_addr(%x : !llvm.ptr) { + // expected-error@below {{has_device_addr clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target has_device_addr(%x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_if(%x : i1) { + // expected-error@below {{if clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target if(%x) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 
= llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func @target_in_reduction(%x : !llvm.ptr) { + // expected-error@below {{in_reduction clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target in_reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_is_device_ptr(%x : !llvm.ptr) { + // expected-error@below {{is_device_ptr clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target is_device_ptr(%x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = firstprivate} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) +} copy { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) +} +llvm.func @target_firstprivate(%x : !llvm.ptr) { + // expected-error@below {{firstprivate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) +} dealloc { +^bb0(%arg0: !llvm.ptr): + omp.yield +} +llvm.func @target_struct_privatization(%x : !llvm.ptr) { + // expected-error@below {{privatization of structures not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_thread_limit(%x : i32) { + // expected-error@below {{thread_limit clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target}} + omp.target thread_limit(%x : i32) { + omp.terminator + } + 
llvm.return +} + +// ----- + +llvm.func @target_enter_data_depend(%x: !llvm.ptr) { + // expected-error@below {{depend clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}} + omp.target_enter_data depend(taskdependin -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_exit_data_depend(%x: !llvm.ptr) { + // expected-error@below {{depend clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target_exit_data}} + omp.target_exit_data depend(taskdependin -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @target_update_depend(%x: !llvm.ptr) { + // expected-error@below {{depend clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.target_update}} + omp.target_update depend(taskdependin -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @task_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func @task_in_reduction(%x : !llvm.ptr) { + // expected-error@below {{in_reduction clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task in_reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + 
+llvm.func @task_mergeable() { + // expected-error@below {{mergeable clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task mergeable { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @task_priority(%x : i32) { + // expected-error@below {{priority clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task priority(%x : i32) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @task_private(%x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @task_untied() { + // expected-error@below {{untied clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.task}} + omp.task untied { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @taskgroup_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.taskgroup}} + omp.taskgroup allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func 
@taskgroup_task_reduction(%x : !llvm.ptr) { + // expected-error@below {{task_reduction clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.taskgroup}} + omp.taskgroup task_reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) { + // expected-error@below {{unsupported OpenMP operation: omp.taskloop}} + // expected-error@below {{LLVM Translation failed for operation: omp.taskloop}} + omp.taskloop { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @taskwait_depend(%x: !llvm.ptr) { + // expected-error@below {{depend clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.taskwait}} + omp.taskwait depend(taskdependin -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @taskwait_nowait() { + // expected-error@below {{nowait clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.taskwait}} + omp.taskwait nowait { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @teams_allocate(%x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.teams}} + omp.teams allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @teams_private(%x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.teams}} + omp.teams private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.terminator + } + llvm.return 
+} + +// ----- + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +llvm.func @teams_reduction(%x : !llvm.ptr) { + // expected-error@below {{reduction clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.teams}} + omp.teams reduction(@add_f32 %x -> %prv : !llvm.ptr) { + omp.terminator + } + llvm.return +} + +// ----- + +llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{allocate clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} + omp.wsloop allocate(%x : !llvm.ptr -> %x : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @wsloop_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{linear clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} + omp.wsloop linear(%x = %step : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) { + // expected-error@below {{order clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} + omp.wsloop order(concurrent) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} + +// ----- + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x i32 
: (i32) -> !llvm.ptr + omp.yield(%1 : !llvm.ptr) +} +llvm.func @wsloop_private(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { + // expected-error@below {{privatization clause not yet supported}} + // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} + omp.wsloop private(@x.privatizer %x -> %arg0 : !llvm.ptr) { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + } + llvm.return +} diff --git a/mlir/test/Transforms/test-operation-folder-commutative.mlir b/mlir/test/Transforms/test-operation-folder-commutative.mlir index 8ffdeb54f399d..55556c1ec5844 100644 --- a/mlir/test/Transforms/test-operation-folder-commutative.mlir +++ b/mlir/test/Transforms/test-operation-folder-commutative.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --pass-pipeline="builtin.module(test-patterns)" %s | FileCheck %s +// RUN: mlir-opt --pass-pipeline="builtin.module(test-greedy-patterns)" %s | FileCheck %s // CHECK-LABEL: func @test_reorder_constants_and_match func.func @test_reorder_constants_and_match(%arg0 : i32) -> (i32) { diff --git a/mlir/test/Transforms/test-operation-folder.mlir b/mlir/test/Transforms/test-operation-folder.mlir index 46ee07af993cc..3c0cd15dc6c51 100644 --- a/mlir/test/Transforms/test-operation-folder.mlir +++ b/mlir/test/Transforms/test-operation-folder.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -test-patterns='top-down=false' %s | FileCheck %s -// RUN: mlir-opt -test-patterns='top-down=true' %s | FileCheck %s +// RUN: mlir-opt -test-greedy-patterns='top-down=false' %s | FileCheck %s +// RUN: mlir-opt -test-greedy-patterns='top-down=true' %s | FileCheck %s func.func @foo() -> i32 { %c42 = arith.constant 42 : i32 diff --git a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp index 92216da9f201e..de511c58ae6ee 100644 --- a/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp +++ b/mlir/test/lib/Dialect/Func/TestDecomposeCallGraphTypes.cpp @@ -21,23 
+21,40 @@ namespace { /// given tuple value. If some tuple elements are, in turn, tuples, the elements /// of those are extracted recursively such that the returned values have the /// same types as `resultTypes.getFlattenedTypes()`. -static LogicalResult buildDecomposeTuple(OpBuilder &builder, Location loc, - TupleType resultType, Value value, - SmallVectorImpl &values) { - for (unsigned i = 0, e = resultType.size(); i < e; ++i) { - Type elementType = resultType.getType(i); - Value element = builder.create( - loc, elementType, value, builder.getI32IntegerAttr(i)); - if (auto nestedTupleType = dyn_cast(elementType)) { - // Recurse if the current element is also a tuple. - if (failed(buildDecomposeTuple(builder, loc, nestedTupleType, element, - values))) - return failure(); - } else { - values.push_back(element); +static SmallVector buildDecomposeTuple(OpBuilder &builder, + TypeRange resultTypes, + ValueRange inputs, Location loc) { + // Skip materialization if the single input value is not a tuple. + if (inputs.size() != 1) + return {}; + Value tuple = inputs.front(); + auto tupleType = dyn_cast(tuple.getType()); + if (!tupleType) + return {}; + // Skip materialization if the flattened types do not match the requested + // result types. + SmallVector flattenedTypes; + tupleType.getFlattenedTypes(flattenedTypes); + if (TypeRange(resultTypes) != TypeRange(flattenedTypes)) + return {}; + // Recursively decompose the tuple. + SmallVector result; + std::function decompose = [&](Value tuple) { + auto tupleType = dyn_cast(tuple.getType()); + if (!tupleType) { + // This is not a tuple. 
+ result.push_back(tuple); + return; } - } - return success(); + for (unsigned i = 0, e = tupleType.size(); i < e; ++i) { + Type elementType = tupleType.getType(i); + Value element = builder.create( + loc, elementType, tuple, builder.getI32IntegerAttr(i)); + decompose(element); + } + }; + decompose(tuple); + return result; } /// Creates a `test.make_tuple` op out of the given inputs building a tuple of @@ -82,8 +99,8 @@ static Value buildMakeTupleOp(OpBuilder &builder, TupleType resultType, /// A pass for testing call graph type decomposition. /// -/// This instantiates the patterns with a TypeConverter and ValueDecomposer -/// that splits tuple types into their respective element types. +/// This instantiates the patterns with a TypeConverter that splits tuple types +/// into their respective element types. /// For example, `tuple --> T1, T2, T3`. struct TestDecomposeCallGraphTypes : public PassWrapper> { @@ -123,12 +140,9 @@ struct TestDecomposeCallGraphTypes return success(); }); typeConverter.addArgumentMaterialization(buildMakeTupleOp); + typeConverter.addTargetMaterialization(buildDecomposeTuple); - ValueDecomposer decomposer; - decomposer.addDecomposeValueConversion(buildDecomposeTuple); - - populateDecomposeCallGraphTypesPatterns(context, typeConverter, decomposer, - patterns); + populateDecomposeCallGraphTypesPatterns(context, typeConverter, patterns); if (failed(applyPartialConversion(module, target, std::move(patterns)))) return signalPassFailure(); diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 3eade0369f765..3df6cff3c0a60 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -13,12 +13,16 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/Visitors.h" 
#include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/FoldUtils.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/WalkPatternRewriteDriver.h" #include "llvm/ADT/ScopeExit.h" +#include using namespace mlir; using namespace test; @@ -214,6 +218,30 @@ struct MoveBeforeParentOp : public RewritePattern { } }; +/// This pattern moves "test.move_after_parent_op" after the parent op. +struct MoveAfterParentOp : public RewritePattern { + MoveAfterParentOp(MLIRContext *context) + : RewritePattern("test.move_after_parent_op", /*benefit=*/1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + // Do not hoist past functions. + if (isa(op->getParentOp())) + return failure(); + + int64_t moveForwardBy = 0; + if (auto advanceBy = op->getAttrOfType("advance")) + moveForwardBy = advanceBy.getInt(); + + Operation *moveAfter = op->getParentOp(); + for (int64_t i = 0; i < moveForwardBy; ++i) + moveAfter = moveAfter->getNextNode(); + + rewriter.moveOpAfter(op, moveAfter); + return success(); + } +}; + /// This pattern inlines blocks that are nested in /// "test.inline_blocks_into_parent" into the parent block. struct InlineBlocksIntoParent : public RewritePattern { @@ -286,14 +314,63 @@ struct CloneRegionBeforeOp : public RewritePattern { } }; -struct TestPatternDriver - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPatternDriver) +/// Replace an operation may introduce the re-visiting of its users. 
+class ReplaceWithNewOp : public RewritePattern { +public: + ReplaceWithNewOp(MLIRContext *context) + : RewritePattern("test.replace_with_new_op", /*benefit=*/1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + Operation *newOp; + if (op->hasAttr("create_erase_op")) { + newOp = rewriter.create( + op->getLoc(), + OperationName("test.erase_op", op->getContext()).getIdentifier(), + ValueRange(), TypeRange()); + } else { + newOp = rewriter.create( + op->getLoc(), + OperationName("test.new_op", op->getContext()).getIdentifier(), + op->getOperands(), op->getResultTypes()); + } + // "replaceOp" could be used instead of "replaceAllOpUsesWith"+"eraseOp". + // A "notifyOperationReplaced" callback is triggered in either case. + rewriter.replaceAllOpUsesWith(op, newOp->getResults()); + rewriter.eraseOp(op); + return success(); + } +}; + +/// Erases the first child block of the matched "test.erase_first_block" +/// operation. +class EraseFirstBlock : public RewritePattern { +public: + EraseFirstBlock(MLIRContext *context) + : RewritePattern("test.erase_first_block", /*benefit=*/1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + for (Region &r : op->getRegions()) { + for (Block &b : r.getBlocks()) { + rewriter.eraseBlock(&b); + return success(); + } + } + + return failure(); + } +}; + +struct TestGreedyPatternDriver + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestGreedyPatternDriver) - TestPatternDriver() = default; - TestPatternDriver(const TestPatternDriver &other) : PassWrapper(other) {} + TestGreedyPatternDriver() = default; + TestGreedyPatternDriver(const TestGreedyPatternDriver &other) + : PassWrapper(other) {} - StringRef getArgument() const final { return "test-patterns"; } + StringRef getArgument() const final { return "test-greedy-patterns"; } StringRef getDescription() const final { return "Run test dialect patterns"; } 
void runOnOperation() override { mlir::RewritePatternSet patterns(&getContext()); @@ -470,34 +547,6 @@ struct TestStrictPatternDriver } }; - // Replace an operation may introduce the re-visiting of its users. - class ReplaceWithNewOp : public RewritePattern { - public: - ReplaceWithNewOp(MLIRContext *context) - : RewritePattern("test.replace_with_new_op", /*benefit=*/1, context) {} - - LogicalResult matchAndRewrite(Operation *op, - PatternRewriter &rewriter) const override { - Operation *newOp; - if (op->hasAttr("create_erase_op")) { - newOp = rewriter.create( - op->getLoc(), - OperationName("test.erase_op", op->getContext()).getIdentifier(), - ValueRange(), TypeRange()); - } else { - newOp = rewriter.create( - op->getLoc(), - OperationName("test.new_op", op->getContext()).getIdentifier(), - op->getOperands(), op->getResultTypes()); - } - // "replaceOp" could be used instead of "replaceAllOpUsesWith"+"eraseOp". - // A "notifyOperationReplaced" callback is triggered in either case. - rewriter.replaceAllOpUsesWith(op, newOp->getResults()); - rewriter.eraseOp(op); - return success(); - } - }; - // Remove an operation may introduce the re-visiting of its operands. class EraseOp : public RewritePattern { public: @@ -560,6 +609,39 @@ struct TestStrictPatternDriver }; }; +struct TestWalkPatternDriver final + : PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestWalkPatternDriver) + + TestWalkPatternDriver() = default; + TestWalkPatternDriver(const TestWalkPatternDriver &other) + : PassWrapper(other) {} + + StringRef getArgument() const override { + return "test-walk-pattern-rewrite-driver"; + } + StringRef getDescription() const override { + return "Run test walk pattern rewrite driver"; + } + void runOnOperation() override { + mlir::RewritePatternSet patterns(&getContext()); + + // Patterns for testing the WalkPatternRewriteDriver. 
+ patterns.add, MoveBeforeParentOp, + MoveAfterParentOp, CloneOp, ReplaceWithNewOp, EraseFirstBlock>( + &getContext()); + + DumpNotifications dumpListener; + walkAndApplyPatterns(getOperation(), std::move(patterns), + dumpNotifications ? &dumpListener : nullptr); + } + + Option dumpNotifications{ + *this, "dump-notifications", + llvm::cl::desc("Print rewrite listener notifications"), + llvm::cl::init(false)}; +}; + } // namespace //===----------------------------------------------------------------------===// @@ -1978,8 +2060,9 @@ void registerPatternsTestPass() { PassRegistration(); - PassRegistration(); + PassRegistration(); PassRegistration(); + PassRegistration(); PassRegistration([] { return std::make_unique(legalizerConversionMode); diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index ba85cb8406b31..632046389e12c 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -60,7 +60,7 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands", } // CHECK: def attr_sized_operands(variadic1, non_variadic, *, variadic2=None, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(AttrSizedOperandsOp(variadic1=variadic1, non_variadic=non_variadic, variadic2=variadic2, loc=loc, ip=ip)) +// CHECK: return AttrSizedOperandsOp(variadic1=variadic1, non_variadic=non_variadic, variadic2=variadic2, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class AttrSizedResultsOp(_ods_ir.OpView): @@ -157,7 +157,7 @@ def AttributedOp : TestOp<"attributed_op"> { } // CHECK: def attributed_op(i32attr, in_, *, optional_f32_attr=None, unit_attr=None, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(AttributedOp(i32attr=i32attr, in_=in_, optionalF32Attr=optional_f32_attr, unitAttr=unit_attr, loc=loc, ip=ip)) +// CHECK: return AttributedOp(i32attr=i32attr, in_=in_, optionalF32Attr=optional_f32_attr, unitAttr=unit_attr, loc=loc, 
ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class AttributedOpWithOperands(_ods_ir.OpView): @@ -193,7 +193,7 @@ def AttributedOpWithOperands : TestOp<"attributed_op_with_operands"> { } // CHECK: def attributed_op_with_operands(_gen_arg_0, _gen_arg_2, *, in_=None, is_=None, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(AttributedOpWithOperands(_gen_arg_0=_gen_arg_0, _gen_arg_2=_gen_arg_2, in_=in_, is_=is_, loc=loc, ip=ip)) +// CHECK: return AttributedOpWithOperands(_gen_arg_0=_gen_arg_0, _gen_arg_2=_gen_arg_2, in_=in_, is_=is_, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class DefaultValuedAttrsOp(_ods_ir.OpView): @@ -217,7 +217,7 @@ def DefaultValuedAttrsOp : TestOp<"default_valued_attrs"> { } // CHECK: def default_valued_attrs(*, arr=None, unsupported=None, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(DefaultValuedAttrsOp(arr=arr, unsupported=unsupported, loc=loc, ip=ip)) +// CHECK: return DefaultValuedAttrsOp(arr=arr, unsupported=unsupported, loc=loc, ip=ip) // CHECK-LABEL: OPERATION_NAME = "test.derive_result_types_op" def DeriveResultTypesOp : TestOp<"derive_result_types_op", [FirstAttrDerivedResultType]> { @@ -235,7 +235,7 @@ def DeriveResultTypesOp : TestOp<"derive_result_types_op", [FirstAttrDerivedResu } // CHECK: def derive_result_types_op(type_, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(DeriveResultTypesOp(type_=type_, loc=loc, ip=ip)) +// CHECK: return DeriveResultTypesOp(type_=type_, loc=loc, ip=ip).results // CHECK-LABEL: OPERATION_NAME = "test.derive_result_types_variadic_op" def DeriveResultTypesVariadicOp : TestOp<"derive_result_types_variadic_op", [FirstAttrDerivedResultType]> { @@ -262,7 +262,7 @@ def EmptyOp : TestOp<"empty">; // CHECK: successors=_ods_successors, regions=regions, loc=loc, ip=ip)) // CHECK: def empty(*, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(EmptyOp(loc=loc, ip=ip)) +// CHECK: 
return EmptyOp(loc=loc, ip=ip) // CHECK-LABEL: OPERATION_NAME = "test.infer_result_types_implied_op" def InferResultTypesImpliedOp : TestOp<"infer_result_types_implied_op"> { @@ -275,7 +275,7 @@ def InferResultTypesImpliedOp : TestOp<"infer_result_types_implied_op"> { } // CHECK: def infer_result_types_implied_op(*, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(InferResultTypesImpliedOp(loc=loc, ip=ip)) +// CHECK: return InferResultTypesImpliedOp(loc=loc, ip=ip).results // CHECK-LABEL: OPERATION_NAME = "test.infer_result_types_op" def InferResultTypesOp : TestOp<"infer_result_types_op", [InferTypeOpInterface]> { @@ -288,7 +288,7 @@ def InferResultTypesOp : TestOp<"infer_result_types_op", [InferTypeOpInterface]> } // CHECK: def infer_result_types_op(*, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(InferResultTypesOp(loc=loc, ip=ip)) +// CHECK: return InferResultTypesOp(loc=loc, ip=ip).results // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class MissingNamesOp(_ods_ir.OpView): @@ -326,7 +326,7 @@ def MissingNamesOp : TestOp<"missing_names"> { } // CHECK: def missing_names(i32, _gen_res_1, i64, _gen_arg_0, f32, _gen_arg_2, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(MissingNamesOp(i32=i32, _gen_res_1=_gen_res_1, i64=i64, _gen_arg_0=_gen_arg_0, f32=f32, _gen_arg_2=_gen_arg_2, loc=loc, ip=ip)) +// CHECK: return MissingNamesOp(i32=i32, _gen_res_1=_gen_res_1, i64=i64, _gen_arg_0=_gen_arg_0, f32=f32, _gen_arg_2=_gen_arg_2, loc=loc, ip=ip).results // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class OneOptionalOperandOp(_ods_ir.OpView): @@ -357,7 +357,7 @@ def OneOptionalOperandOp : TestOp<"one_optional_operand"> { } // CHECK: def one_optional_operand(non_optional, *, optional=None, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(OneOptionalOperandOp(non_optional=non_optional, optional=optional, loc=loc, ip=ip)) +// CHECK: return 
OneOptionalOperandOp(non_optional=non_optional, optional=optional, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class OneVariadicOperandOp(_ods_ir.OpView): @@ -389,7 +389,7 @@ def OneVariadicOperandOp : TestOp<"one_variadic_operand"> { } // CHECK: def one_variadic_operand(non_variadic, variadic, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(OneVariadicOperandOp(non_variadic=non_variadic, variadic=variadic, loc=loc, ip=ip)) +// CHECK: return OneVariadicOperandOp(non_variadic=non_variadic, variadic=variadic, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class OneVariadicResultOp(_ods_ir.OpView): @@ -446,7 +446,7 @@ def PythonKeywordOp : TestOp<"python_keyword"> { } // CHECK: def python_keyword(in_, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(PythonKeywordOp(in_=in_, loc=loc, ip=ip)) +// CHECK: return PythonKeywordOp(in_=in_, loc=loc, ip=ip) // CHECK-LABEL: OPERATION_NAME = "test.same_results" def SameResultsOp : TestOp<"same_results", [SameOperandsAndResultType]> { @@ -460,7 +460,7 @@ def SameResultsOp : TestOp<"same_results", [SameOperandsAndResultType]> { } // CHECK: def same_results(in1, in2, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(SameResultsOp(in1=in1, in2=in2, loc=loc, ip=ip)) +// CHECK: return SameResultsOp(in1=in1, in2=in2, loc=loc, ip=ip) // CHECK-LABEL: OPERATION_NAME = "test.same_results_variadic" def SameResultsVariadicOp : TestOp<"same_results_variadic", [SameOperandsAndResultType]> { @@ -497,7 +497,7 @@ def SameVariadicOperandSizeOp : TestOp<"same_variadic_operand", } // CHECK: def same_variadic_operand(variadic1, non_variadic, variadic2, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(SameVariadicOperandSizeOp(variadic1=variadic1, non_variadic=non_variadic, variadic2=variadic2, loc=loc, ip=ip)) +// CHECK: return SameVariadicOperandSizeOp(variadic1=variadic1, non_variadic=non_variadic, 
variadic2=variadic2, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class SameVariadicResultSizeOp(_ods_ir.OpView): @@ -563,7 +563,7 @@ def SimpleOp : TestOp<"simple"> { } // CHECK: def simple(i64, f64, i32, f32, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(SimpleOp(i64=i64, f64=f64, i32=i32, f32=f32, loc=loc, ip=ip)) +// CHECK: return SimpleOp(i64=i64, f64=f64, i32=i32, f32=f32, loc=loc, ip=ip).results // CHECK: class VariadicAndNormalRegionOp(_ods_ir.OpView): // CHECK-LABEL: OPERATION_NAME = "test.variadic_and_normal_region" @@ -590,7 +590,7 @@ def VariadicAndNormalRegionOp : TestOp<"variadic_and_normal_region"> { } // CHECK: def variadic_and_normal_region(num_variadic, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(VariadicAndNormalRegionOp(num_variadic=num_variadic, loc=loc, ip=ip)) +// CHECK: return VariadicAndNormalRegionOp(num_variadic=num_variadic, loc=loc, ip=ip) // CHECK: class VariadicRegionOp(_ods_ir.OpView): // CHECK-LABEL: OPERATION_NAME = "test.variadic_region" @@ -613,7 +613,7 @@ def VariadicRegionOp : TestOp<"variadic_region"> { } // CHECK: def variadic_region(num_variadic, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(VariadicRegionOp(num_variadic=num_variadic, loc=loc, ip=ip)) +// CHECK: return VariadicRegionOp(num_variadic=num_variadic, loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class WithSpecialCharactersOp(_ods_ir.OpView): @@ -622,7 +622,7 @@ def WithSpecialCharactersOp : TestOp<"123with--special.characters"> { } // CHECK: def _123with__special_characters(*, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(WithSpecialCharactersOp(loc=loc, ip=ip)) +// CHECK: return WithSpecialCharactersOp(loc=loc, ip=ip) // CHECK: @_ods_cext.register_operation(_Dialect) // CHECK: class WithSuccessorsOp(_ods_ir.OpView): @@ -637,4 +637,4 @@ def WithSuccessorsOp : TestOp<"with_successors"> { } // CHECK: def 
with_successors(successor, successors, *, loc=None, ip=None) -// CHECK: return _get_op_result_or_op_results(WithSuccessorsOp(successor=successor, successors=successors, loc=loc, ip=ip)) +// CHECK: return WithSuccessorsOp(successor=successor, successors=successors, loc=loc, ip=ip) diff --git a/mlir/test/mlir-tblgen/pattern.mlir b/mlir/test/mlir-tblgen/pattern.mlir index 5ff8710b93770..60d46e676d2a3 100644 --- a/mlir/test/mlir-tblgen/pattern.mlir +++ b/mlir/test/mlir-tblgen/pattern.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-patterns -mlir-print-debuginfo -mlir-print-local-scope %s | FileCheck %s +// RUN: mlir-opt -test-greedy-patterns -mlir-print-debuginfo -mlir-print-local-scope %s | FileCheck %s // CHECK-LABEL: verifyFusedLocs func.func @verifyFusedLocs(%arg0 : i32) -> i32 { diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index 0c5c936f5adde..5019b69d91127 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -271,6 +271,11 @@ constexpr const char *regionAccessorTemplate = R"Py( )Py"; constexpr const char *valueBuilderTemplate = R"Py( +def {0}({2}) -> {4}: + return {1}({3}){5} +)Py"; + +constexpr const char *valueBuilderVariadicTemplate = R"Py( def {0}({2}) -> {4}: return _get_op_result_or_op_results({1}({3})) )Py"; @@ -992,15 +997,29 @@ static void emitValueBuilder(const Operator &op, auto lhs = *llvm::split(arg, "=").begin(); return (lhs + "=" + llvm::convertToSnakeFromCamelCase(lhs)).str(); }); - std::string nameWithoutDialect = - op.getOperationName().substr(op.getOperationName().find('.') + 1); - os << formatv( - valueBuilderTemplate, sanitizeName(nameWithoutDialect), - op.getCppClassName(), llvm::join(valueBuilderParams, ", "), - llvm::join(opBuilderArgs, ", "), + std::string nameWithoutDialect = sanitizeName( + op.getOperationName().substr(op.getOperationName().find('.') + 1)); + std::string params = llvm::join(valueBuilderParams, ", "); + 
std::string args = llvm::join(opBuilderArgs, ", "); + const char *type = (op.getNumResults() > 1 ? "_Sequence[_ods_ir.Value]" - : (op.getNumResults() > 0 ? "_ods_ir.Value" : "_ods_ir.Operation"))); + : (op.getNumResults() > 0 ? "_ods_ir.Value" : "_ods_ir.Operation")); + if (op.getNumVariableLengthResults() > 0) { + os << formatv(valueBuilderVariadicTemplate, nameWithoutDialect, + op.getCppClassName(), params, args, type); + } else { + const char *results; + if (op.getNumResults() == 0) { + results = ""; + } else if (op.getNumResults() == 1) { + results = ".result"; + } else { + results = ".results"; + } + os << formatv(valueBuilderTemplate, nameWithoutDialect, + op.getCppClassName(), params, args, type, results); + } } /// Emits bindings for a specific Op to the given output stream. diff --git a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp index b667785c16f16..db294b8b040e9 100644 --- a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp +++ b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp @@ -99,9 +99,9 @@ struct CustomDataLayoutSpec class TargetSystemSpecStorage : public AttributeStorage { public: - using KeyTy = ArrayRef; + using KeyTy = ArrayRef; - TargetSystemSpecStorage(ArrayRef entries) + TargetSystemSpecStorage(ArrayRef entries) : entries(entries) {} bool operator==(const KeyTy &key) const { return key == entries; } @@ -112,7 +112,7 @@ class TargetSystemSpecStorage : public AttributeStorage { TargetSystemSpecStorage(allocator.copyInto(key)); } - ArrayRef entries; + ArrayRef entries; }; struct CustomTargetSystemSpec @@ -126,18 +126,20 @@ struct CustomTargetSystemSpec static constexpr StringLiteral name = "test.custom_target_system_spec"; static CustomTargetSystemSpec - get(MLIRContext *ctx, ArrayRef entries) { + get(MLIRContext *ctx, ArrayRef entries) { return Base::get(ctx, entries); } - DeviceIDTargetDeviceSpecPairListRef getEntries() const { + ArrayRef getEntries() const { 
return getImpl()->entries; } LogicalResult verifySpec(Location loc) { return success(); } std::optional getDeviceSpecForDeviceID(TargetSystemSpecInterface::DeviceID deviceID) { for (const auto &entry : getEntries()) { - if (entry.first == deviceID) - return entry.second; + if (entry.getKey() == DataLayoutEntryKey(deviceID)) + if (auto deviceSpec = + llvm::dyn_cast(entry.getValue())) + return deviceSpec; } return std::nullopt; } @@ -388,9 +390,11 @@ struct DLTargetSystemDescTestDialect : public Dialect { void printAttribute(Attribute attr, DialectAsmPrinter &printer) const override { printer << "target_system_spec<"; - llvm::interleaveComma( - cast(attr).getEntries(), printer, - [&](const auto &it) { printer << it.first << ":" << it.second; }); + llvm::interleaveComma(cast(attr).getEntries(), + printer, [&](const auto &it) { + printer << dyn_cast(it.getKey()) << ":" + << it.getValue(); + }); printer << ">"; } @@ -402,8 +406,8 @@ struct DLTargetSystemDescTestDialect : public Dialect { if (succeeded(parser.parseOptionalGreater())) return CustomTargetSystemSpec::get(parser.getContext(), {}); - auto parseDeviceIDTargetDeviceSpecPair = - [&](AsmParser &parser) -> FailureOr { + auto parseTargetDeviceSpecEntry = + [&](AsmParser &parser) -> FailureOr { std::string deviceID; if (failed(parser.parseString(&deviceID))) { parser.emitError(parser.getCurrentLocation()) @@ -425,13 +429,15 @@ struct DLTargetSystemDescTestDialect : public Dialect { targetDeviceSpec); }; - SmallVector entries; + SmallVector entries; ok = succeeded(parser.parseCommaSeparatedList([&]() { - auto deviceIDAndTargetDeviceSpecPair = - parseDeviceIDTargetDeviceSpecPair(parser); + auto deviceIDAndTargetDeviceSpecPair = parseTargetDeviceSpecEntry(parser); ok = succeeded(deviceIDAndTargetDeviceSpecPair); assert(ok); - entries.push_back(*deviceIDAndTargetDeviceSpecPair); + auto entry = + DataLayoutEntryAttr::get(deviceIDAndTargetDeviceSpecPair->first, + deviceIDAndTargetDeviceSpecPair->second); + 
entries.push_back(entry); return success(); })); assert(ok); diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py index 2fb540ef10325..99ed3489b4cbd 100755 --- a/mlir/utils/spirv/gen_spirv_dialect.py +++ b/mlir/utils/spirv/gen_spirv_dialect.py @@ -127,44 +127,6 @@ def split_list_into_sublists(items): return chuncks -def uniquify_enum_cases(lst): - """Prunes duplicate enum cases from the list. - - Arguments: - - lst: List whose elements are to be uniqued. Assumes each element is a - (symbol, value) pair and elements already sorted according to value. - - Returns: - - A list with all duplicates removed. The elements are sorted according to - value and, for each value, uniqued according to symbol. - original list, - - A map from deduplicated cases to the uniqued case. - """ - cases = lst - uniqued_cases = [] - duplicated_cases = {} - - # First sort according to the value - cases.sort(key=lambda x: x[1]) - - # Then group them according to the value - for _, groups in itertools.groupby(cases, key=lambda x: x[1]): - # For each value, sort according to the enumerant symbol. - sorted_group = sorted(groups, key=lambda x: x[0]) - # Keep the "smallest" case, which is typically the symbol without extension - # suffix. But we have special cases that we want to fix. - case = sorted_group[0] - for i in range(1, len(sorted_group)): - duplicated_cases[sorted_group[i][0]] = case[0] - if case[0] == "HlslSemanticGOOGLE": - assert len(sorted_group) == 2, "unexpected new variant for HlslSemantic" - case = sorted_group[1] - duplicated_cases[sorted_group[0][0]] = case[0] - uniqued_cases.append(case) - - return uniqued_cases, duplicated_cases - - def toposort(dag, sort_fn): """Topologically sorts the given dag. @@ -197,14 +159,12 @@ def get_next_batch(dag): return sorted_nodes -def toposort_capabilities(all_cases, capability_mapping): +def toposort_capabilities(all_cases): """Returns topologically sorted capability (symbol, value) pairs. 
Arguments: - all_cases: all capability cases (containing symbol, value, and implied capabilities). - - capability_mapping: mapping from duplicated capability symbols to the - canonicalized symbol chosen for SPIRVBase.td. Returns: A list containing topologically sorted capability (symbol, value) pairs. @@ -215,13 +175,10 @@ def toposort_capabilities(all_cases, capability_mapping): # Get the current capability. cur = case["enumerant"] name_to_value[cur] = case["value"] - # Ignore duplicated symbols. - if cur in capability_mapping: - continue # Get capabilities implied by the current capability. prev = case.get("capabilities", []) - uniqued_prev = set([capability_mapping.get(c, c) for c in prev]) + uniqued_prev = set(prev) dag[cur] = uniqued_prev sorted_caps = toposort(dag, lambda x: name_to_value[x]) @@ -229,36 +186,12 @@ def toposort_capabilities(all_cases, capability_mapping): return [(c, name_to_value[c]) for c in sorted_caps] -def get_capability_mapping(operand_kinds): - """Returns the capability mapping from duplicated cases to canonicalized ones. - - Arguments: - - operand_kinds: all operand kinds' grammar spec - - Returns: - - A map mapping from duplicated capability symbols to the canonicalized - symbol chosen for SPIRVBase.td. - """ - # Find the operand kind for capability - cap_kind = {} - for kind in operand_kinds: - if kind["kind"] == "Capability": - cap_kind = kind - - kind_cases = [(case["enumerant"], case["value"]) for case in cap_kind["enumerants"]] - _, capability_mapping = uniquify_enum_cases(kind_cases) - - return capability_mapping - - -def get_availability_spec(enum_case, capability_mapping, for_op, for_cap): +def get_availability_spec(enum_case, for_op, for_cap): """Returns the availability specification string for the given enum case. Arguments: - enum_case: the enum case to generate availability spec for. It may contain 'version', 'lastVersion', 'extensions', or 'capabilities'. 
- - capability_mapping: mapping from duplicated capability symbols to the - canonicalized symbol chosen for SPIRVBase.td. - for_op: bool value indicating whether this is the availability spec for an op itself. - for_cap: bool value indicating whether this is the availability spec for @@ -313,10 +246,7 @@ def get_availability_spec(enum_case, capability_mapping, for_op, for_cap): if caps: canonicalized_caps = [] for c in caps: - if c in capability_mapping: - canonicalized_caps.append(capability_mapping[c]) - else: - canonicalized_caps.append(c) + canonicalized_caps.append(c) prefixed_caps = [ "SPIRV_C_{}".format(c) for c in sorted(set(canonicalized_caps)) ] @@ -357,7 +287,7 @@ def get_availability_spec(enum_case, capability_mapping, for_op, for_cap): return "{}{}{}".format(implies, "\n " if implies and avail else "", avail) -def gen_operand_kind_enum_attr(operand_kind, capability_mapping): +def gen_operand_kind_enum_attr(operand_kind): """Generates the TableGen EnumAttr definition for the given operand kind. Returns: @@ -388,13 +318,12 @@ def get_case_symbol(kind_name, case_name): # Special treatment for capability cases: we need to sort them topologically # because a capability can refer to another via the 'implies' field. 
kind_cases = toposort_capabilities( - operand_kind["enumerants"], capability_mapping + operand_kind["enumerants"] ) else: kind_cases = [ (case["enumerant"], case["value"]) for case in operand_kind["enumerants"] ] - kind_cases, _ = uniquify_enum_cases(kind_cases) max_len = max([len(symbol) for (symbol, _) in kind_cases]) # Generate the definition for each enum case @@ -412,7 +341,6 @@ def get_case_symbol(kind_name, case_name): value = int(case_pair[1]) avail = get_availability_spec( name_to_case_dict[name], - capability_mapping, False, kind_name == "Capability", ) @@ -648,11 +576,9 @@ def update_td_enum_attrs(path, operand_kinds, filter_list): ] filter_list.extend(existing_kinds) - capability_mapping = get_capability_mapping(operand_kinds) - # Generate definitions for all enums in filter list defs = [ - gen_operand_kind_enum_attr(kind, capability_mapping) + gen_operand_kind_enum_attr(kind) for kind in operand_kinds if kind["kind"] in filter_list ] @@ -762,7 +688,7 @@ def get_description(text, appendix): def get_op_definition( - instruction, opname, doc, existing_info, capability_mapping, settings + instruction, opname, doc, existing_info, settings ): """Generates the TableGen op definition for the given SPIR-V instruction. 
@@ -771,8 +697,6 @@ def get_op_definition( - doc: the instruction's SPIR-V HTML doc - existing_info: a dict containing potential manually specified sections for this instruction - - capability_mapping: mapping from duplicated capability symbols to the - canonicalized symbol chosen for SPIRVBase.td Returns: - A string containing the TableGen op definition @@ -840,7 +764,7 @@ def get_op_definition( operands = instruction.get("operands", []) # Op availability - avail = get_availability_spec(instruction, capability_mapping, True, False) + avail = get_availability_spec(instruction, True, False) if avail: avail = "\n\n {0}".format(avail) @@ -989,6 +913,7 @@ def extract_td_op_info(op_def): op_tmpl_params, _ = get_string_between_nested(op_def, "<", ">") opstringname, rest = get_string_between(op_tmpl_params, '"', '"') category_args = rest.split("[", 1)[0] + category_args = category_args.rsplit(",", 1)[0] # Get traits traits, _ = get_string_between_nested(rest, "[", "]") @@ -1020,7 +945,7 @@ def extract_td_op_info(op_def): def update_td_op_definitions( - path, instructions, docs, filter_list, inst_category, capability_mapping, settings + path, instructions, docs, filter_list, inst_category, settings ): """Updates SPIRVOps.td with newly generated op definition. @@ -1029,8 +954,6 @@ def update_td_op_definitions( - instructions: SPIR-V JSON grammar for all instructions - docs: SPIR-V HTML doc for all instructions - filter_list: a list containing new opnames to include - - capability_mapping: mapping from duplicated capability symbols to the - canonicalized symbol chosen for SPIRVBase.td. 
Returns: - A string containing all the TableGen op definitions @@ -1078,7 +1001,6 @@ opname, docs[fixed_opname], op_info_dict.get(opname, {"inst_category": inst_category}), - capability_mapping, settings, ) ) @@ -1185,14 +1107,12 @@ if args.new_inst is not None: assert args.op_td_path is not None docs = get_spirv_doc_from_html_spec(ext_html_url, args) - capability_mapping = get_capability_mapping(operand_kinds) update_td_op_definitions( args.op_td_path, instructions, docs, args.new_inst, args.inst_category, - capability_mapping, args, ) print("Done. Note that this script just generates a template; ", end="") diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 79db3965de023..73c26578005c3 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -1698,6 +1698,8 @@ bool ScopDetection::hasPossiblyDistributableLoop( DetectionContext &Context) const { for (auto *BB : Context.CurRegion.blocks()) { auto *L = LI.getLoopFor(BB); + if (!L) + continue; if (!Context.CurRegion.contains(L)) continue; if (Context.BoxedLoopsSet.count(L)) diff --git a/polly/test/ScopDetect/detect-full-functions.ll b/polly/test/ScopDetect/detect-full-functions.ll new file mode 100644 index 0000000000000..178ef32827cab --- /dev/null +++ b/polly/test/ScopDetect/detect-full-functions.ll @@ -0,0 +1,17 @@ +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s + +; Verify that a simple function with a basic block not part of any loop doesn't crash with the polly-process-unprofitable=false and polly-detect-full-functions flags.
+ +; CHECK: Detected Scops in Function foo + +define void @foo() { + br label %1 + +1: ; preds = %1, %0 + br i1 false, label %2, label %1 + +2: ; preds = %1 + %3 = load ptr, ptr null, align 8 + store ptr null, ptr null, align 8 + ret void +} diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index d4aeaea6fac84..820163415f98b 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1702,6 +1702,17 @@ libc_support_library( ], ) +libc_support_library( + name = "sincosf16_utils", + hdrs = ["src/math/generic/sincosf16_utils.h"], + deps = [ + ":__support_common", + ":__support_fputil_fp_bits", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ], +) + libc_support_library( name = "explogxf", srcs = ["src/math/generic/explogxf.cpp"], @@ -1773,6 +1784,18 @@ libc_support_library( ], ) +libc_support_library( + name = "expxf16", + hdrs = ["src/math/generic/expxf16.h"], + deps = [ + ":__support_cpp_array", + ":__support_fputil_cast", + ":__support_fputil_fp_bits", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ], +) + ################################ math targets ################################## libc_math_function( @@ -1882,6 +1905,8 @@ libc_math_function(name = "canonicalizel") libc_math_function(name = "canonicalizef128") +libc_math_function(name = "canonicalizef16") + libc_math_function( name = "cbrt", additional_deps = [ @@ -1906,6 +1931,8 @@ libc_math_function(name = "ceill") libc_math_function(name = "ceilf128") +libc_math_function(name = "ceilf16") + libc_math_function(name = "copysign") libc_math_function(name = "copysignf") @@ -1914,6 +1941,8 @@ libc_math_function(name = "copysignl") libc_math_function(name = "copysignf128") +libc_math_function(name = "copysignf16") + libc_math_function( name = "cos", additional_deps = [ @@ -1950,6 +1979,13 @@ libc_math_function( ], ) +libc_math_function( + 
name = "coshf16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "cospif", additional_deps = [ @@ -1965,6 +2001,15 @@ libc_math_function( ], ) +libc_math_function( + name = "cospif16", + additional_deps = [ + ":__support_fputil_multiply_add", + ":__support_macros_optimization", + ":sincosf16_utils", + ], +) + libc_math_function(name = "daddl") libc_math_function(name = "daddf128") @@ -2048,6 +2093,13 @@ libc_math_function( ], ) +libc_math_function( + name = "expf16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "exp10", additional_deps = [ @@ -2072,6 +2124,20 @@ libc_math_function( ], ) +libc_math_function( + name = "exp10f16", + additional_deps = [ + ":expxf16", + ], +) + +libc_math_function( + name = "exp10m1f16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "exp2", additional_deps = [ @@ -2096,6 +2162,13 @@ libc_math_function( ], ) +libc_math_function( + name = "exp2f16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "exp2m1f", additional_deps = [ @@ -2104,6 +2177,13 @@ libc_math_function( ], ) +libc_math_function( + name = "exp2m1f16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "expm1", additional_deps = [ @@ -2135,6 +2215,101 @@ libc_math_function( ], ) +libc_math_function( + name = "expm1f16", + additional_deps = [ + ":expxf16", + ], +) + +libc_math_function(name = "f16add") + +libc_math_function(name = "f16addf") + +libc_math_function(name = "f16addf128") + +libc_math_function(name = "f16addl") + +libc_math_function(name = "f16div") + +libc_math_function(name = "f16divf") + +libc_math_function(name = "f16divf128") + +libc_math_function(name = "f16divl") + +libc_math_function( + name = "f16fma", + additional_deps = [ + ":__support_fputil_fma", + ], +) + +libc_math_function( + name = "f16fmaf", + additional_deps = [ + ":__support_fputil_fma", + ], +) + +libc_math_function( + name = "f16fmaf128", + additional_deps = [ + 
":__support_fputil_fma", + ], +) + +libc_math_function( + name = "f16fmal", + additional_deps = [ + ":__support_fputil_fma", + ], +) + +libc_math_function(name = "f16mul") + +libc_math_function(name = "f16mulf") + +libc_math_function(name = "f16mulf128") + +libc_math_function(name = "f16mull") + +libc_math_function( + name = "f16sqrt", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) + +libc_math_function( + name = "f16sqrtf", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) + +libc_math_function( + name = "f16sqrtf128", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) + +libc_math_function( + name = "f16sqrtl", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) + +libc_math_function(name = "f16sub") + +libc_math_function(name = "f16subf") + +libc_math_function(name = "f16subf128") + +libc_math_function(name = "f16subl") + libc_math_function(name = "fabs") libc_math_function(name = "fabsf") @@ -2143,6 +2318,8 @@ libc_math_function(name = "fabsl") libc_math_function(name = "fabsf128") +libc_math_function(name = "fabsf16") + libc_math_function(name = "fadd") libc_math_function(name = "faddl") @@ -2157,6 +2334,8 @@ libc_math_function(name = "fdiml") libc_math_function(name = "fdimf128") +libc_math_function(name = "fdimf16") + libc_math_function(name = "fdiv") libc_math_function(name = "fdivl") @@ -2192,6 +2371,8 @@ libc_math_function(name = "floorl") libc_math_function(name = "floorf128") +libc_math_function(name = "floorf16") + # TODO: Add fma, fmaf, fmal, fmaf128 functions. 
libc_math_function(name = "fmax") @@ -2202,6 +2383,8 @@ libc_math_function(name = "fmaxl") libc_math_function(name = "fmaxf128") +libc_math_function(name = "fmaxf16") + libc_math_function(name = "fmaximum") libc_math_function(name = "fmaximumf") @@ -2210,6 +2393,8 @@ libc_math_function(name = "fmaximuml") libc_math_function(name = "fmaximumf128") +libc_math_function(name = "fmaximumf16") + libc_math_function(name = "fmaximum_mag") libc_math_function(name = "fmaximum_magf") @@ -2218,6 +2403,8 @@ libc_math_function(name = "fmaximum_magl") libc_math_function(name = "fmaximum_magf128") +libc_math_function(name = "fmaximum_magf16") + libc_math_function(name = "fmaximum_mag_num") libc_math_function(name = "fmaximum_mag_numf") @@ -2226,6 +2413,8 @@ libc_math_function(name = "fmaximum_mag_numl") libc_math_function(name = "fmaximum_mag_numf128") +libc_math_function(name = "fmaximum_mag_numf16") + libc_math_function(name = "fmaximum_num") libc_math_function(name = "fmaximum_numf") @@ -2234,6 +2423,8 @@ libc_math_function(name = "fmaximum_numl") libc_math_function(name = "fmaximum_numf128") +libc_math_function(name = "fmaximum_numf16") + libc_math_function(name = "fmin") libc_math_function(name = "fminf") @@ -2242,6 +2433,8 @@ libc_math_function(name = "fminl") libc_math_function(name = "fminf128") +libc_math_function(name = "fminf16") + libc_math_function(name = "fminimum") libc_math_function(name = "fminimumf") @@ -2250,6 +2443,8 @@ libc_math_function(name = "fminimuml") libc_math_function(name = "fminimumf128") +libc_math_function(name = "fminimumf16") + libc_math_function(name = "fminimum_mag") libc_math_function(name = "fminimum_magf") @@ -2258,6 +2453,8 @@ libc_math_function(name = "fminimum_magl") libc_math_function(name = "fminimum_magf128") +libc_math_function(name = "fminimum_magf16") + libc_math_function(name = "fminimum_mag_num") libc_math_function(name = "fminimum_mag_numf") @@ -2266,6 +2463,8 @@ libc_math_function(name = "fminimum_mag_numl") 
libc_math_function(name = "fminimum_mag_numf128") +libc_math_function(name = "fminimum_mag_numf16") + libc_math_function(name = "fminimum_num") libc_math_function(name = "fminimum_numf") @@ -2274,6 +2473,8 @@ libc_math_function(name = "fminimum_numl") libc_math_function(name = "fminimum_numf128") +libc_math_function(name = "fminimum_numf16") + libc_math_function( name = "fmod", additional_deps = [ @@ -2302,6 +2503,13 @@ libc_math_function( ], ) +libc_math_function( + name = "fmodf16", + additional_deps = [ + ":__support_fputil_generic_fmod", + ], +) + libc_math_function( name = "fmul", additional_deps = [ @@ -2321,6 +2529,8 @@ libc_math_function(name = "frexpl") libc_math_function(name = "frexpf128") +libc_math_function(name = "frexpf16") + libc_math_function(name = "fromfp") libc_math_function(name = "fromfpf") @@ -2329,6 +2539,8 @@ libc_math_function(name = "fromfpl") libc_math_function(name = "fromfpf128") +libc_math_function(name = "fromfpf16") + libc_math_function(name = "fromfpx") libc_math_function(name = "fromfpxf") @@ -2337,6 +2549,8 @@ libc_math_function(name = "fromfpxl") libc_math_function(name = "fromfpxf128") +libc_math_function(name = "fromfpxf16") + libc_math_function( name = "fsqrt", additional_deps = [ @@ -2372,6 +2586,8 @@ libc_math_function(name = "getpayloadl") libc_math_function(name = "getpayloadf128") +libc_math_function(name = "getpayloadf16") + libc_math_function(name = "hypot") libc_math_function( @@ -2390,6 +2606,8 @@ libc_math_function(name = "ilogbl") libc_math_function(name = "ilogbf128") +libc_math_function(name = "ilogbf16") + libc_math_function(name = "ldexp") libc_math_function(name = "ldexpf") @@ -2398,6 +2616,8 @@ libc_math_function(name = "ldexpl") libc_math_function(name = "ldexpf128") +libc_math_function(name = "ldexpf16") + libc_math_function(name = "llogb") libc_math_function(name = "llogbf") @@ -2406,6 +2626,8 @@ libc_math_function(name = "llogbl") libc_math_function(name = "llogbf128") +libc_math_function(name = 
"llogbf16") + libc_math_function(name = "llrint") libc_math_function(name = "llrintf") @@ -2414,6 +2636,8 @@ libc_math_function(name = "llrintl") libc_math_function(name = "llrintf128") +libc_math_function(name = "llrintf16") + libc_math_function(name = "llround") libc_math_function(name = "llroundf") @@ -2422,6 +2646,8 @@ libc_math_function(name = "llroundl") libc_math_function(name = "llroundf128") +libc_math_function(name = "llroundf16") + libc_math_function( name = "log", additional_deps = [ @@ -2450,6 +2676,13 @@ libc_math_function( ], ) +libc_math_function( + name = "logf16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "log10", additional_deps = [ @@ -2478,6 +2711,13 @@ libc_math_function( ], ) +libc_math_function( + name = "log10f16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "log1p", additional_deps = [ @@ -2532,6 +2772,13 @@ libc_math_function( ], ) +libc_math_function( + name = "log2f16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function(name = "logb") libc_math_function(name = "logbf") @@ -2540,6 +2787,8 @@ libc_math_function(name = "logbl") libc_math_function(name = "logbf128") +libc_math_function(name = "logbf16") + libc_math_function(name = "lrint") libc_math_function(name = "lrintf") @@ -2548,6 +2797,8 @@ libc_math_function(name = "lrintl") libc_math_function(name = "lrintf128") +libc_math_function(name = "lrintf16") + libc_math_function(name = "lround") libc_math_function(name = "lroundf") @@ -2556,6 +2807,8 @@ libc_math_function(name = "lroundl") libc_math_function(name = "lroundf128") +libc_math_function(name = "lroundf16") + libc_math_function(name = "modf") libc_math_function(name = "modff") @@ -2564,6 +2817,8 @@ libc_math_function(name = "modfl") libc_math_function(name = "modff128") +libc_math_function(name = "modff16") + libc_math_function( name = "nan", additional_deps = [ @@ -2596,6 +2851,14 @@ libc_math_function( ], ) +libc_math_function( + name = "nanf16", + 
additional_deps = [ + ":__support_str_to_float", + ":errno", + ], +) + libc_math_function(name = "nearbyint") libc_math_function(name = "nearbyintf") @@ -2604,6 +2867,8 @@ libc_math_function(name = "nearbyintl") libc_math_function(name = "nearbyintf128") +libc_math_function(name = "nearbyintf16") + libc_math_function(name = "nextafter") libc_math_function(name = "nextafterf") @@ -2612,6 +2877,8 @@ libc_math_function(name = "nextafterl") libc_math_function(name = "nextafterf128") +libc_math_function(name = "nextafterf16") + libc_math_function(name = "nextdown") libc_math_function(name = "nextdownf") @@ -2620,10 +2887,14 @@ libc_math_function(name = "nextdownl") libc_math_function(name = "nextdownf128") +libc_math_function(name = "nextdownf16") + libc_math_function(name = "nexttoward") libc_math_function(name = "nexttowardf") +libc_math_function(name = "nexttowardf16") + libc_math_function(name = "nexttowardl") libc_math_function(name = "nextup") @@ -2634,6 +2905,8 @@ libc_math_function(name = "nextupl") libc_math_function(name = "nextupf128") +libc_math_function(name = "nextupf16") + libc_math_function( name = "pow", additional_deps = [ @@ -2671,6 +2944,8 @@ libc_math_function(name = "remainderl") libc_math_function(name = "remainderf128") +libc_math_function(name = "remainderf16") + libc_math_function(name = "remquo") libc_math_function(name = "remquof") @@ -2679,6 +2954,8 @@ libc_math_function(name = "remquol") libc_math_function(name = "remquof128") +libc_math_function(name = "remquof16") + libc_math_function(name = "rint") libc_math_function(name = "rintf") @@ -2687,6 +2964,8 @@ libc_math_function(name = "rintl") libc_math_function(name = "rintf128") +libc_math_function(name = "rintf16") + libc_math_function(name = "round") libc_math_function(name = "roundf") @@ -2695,6 +2974,8 @@ libc_math_function(name = "roundl") libc_math_function(name = "roundf128") +libc_math_function(name = "roundf16") + libc_math_function(name = "roundeven") libc_math_function(name = 
"roundevenf") @@ -2703,6 +2984,8 @@ libc_math_function(name = "roundevenl") libc_math_function(name = "roundevenf128") +libc_math_function(name = "roundevenf16") + libc_math_function(name = "scalbln") libc_math_function(name = "scalblnf") @@ -2711,6 +2994,8 @@ libc_math_function(name = "scalblnl") libc_math_function(name = "scalblnf128") +libc_math_function(name = "scalblnf16") + libc_math_function(name = "scalbn") libc_math_function(name = "scalbnf") @@ -2719,6 +3004,8 @@ libc_math_function(name = "scalbnl") libc_math_function(name = "scalbnf128") +libc_math_function(name = "scalbnf16") + libc_math_function(name = "setpayload") libc_math_function(name = "setpayloadf") @@ -2727,6 +3014,8 @@ libc_math_function(name = "setpayloadl") libc_math_function(name = "setpayloadf128") +libc_math_function(name = "setpayloadf16") + libc_math_function(name = "setpayloadsig") libc_math_function(name = "setpayloadsigf") @@ -2735,6 +3024,8 @@ libc_math_function(name = "setpayloadsigl") libc_math_function(name = "setpayloadsigf128") +libc_math_function(name = "setpayloadsigf16") + libc_math_function( name = "sin", additional_deps = [ @@ -2797,6 +3088,13 @@ libc_math_function( ], ) +libc_math_function( + name = "sinhf16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function( name = "sinpif", additional_deps = [ @@ -2804,6 +3102,15 @@ libc_math_function( ], ) +libc_math_function( + name = "sinpif16", + additional_deps = [ + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":sincosf16_utils", + ], +) + libc_math_function( name = "sqrt", additional_deps = [ @@ -2832,6 +3139,13 @@ libc_math_function( ], ) +libc_math_function( + name = "sqrtf16", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) + libc_math_function( name = "tan", additional_deps = [ @@ -2872,6 +3186,13 @@ libc_math_function( ], ) +libc_math_function( + name = "tanhf16", + additional_deps = [ + ":expxf16", + ], +) + libc_math_function(name = "totalorder") libc_math_function(name = 
"totalorderf") @@ -2880,6 +3201,8 @@ libc_math_function(name = "totalorderl") libc_math_function(name = "totalorderf128") +libc_math_function(name = "totalorderf16") + libc_math_function(name = "totalordermag") libc_math_function(name = "totalordermagf") @@ -2888,6 +3211,8 @@ libc_math_function(name = "totalordermagl") libc_math_function(name = "totalordermagf128") +libc_math_function(name = "totalordermagf16") + libc_math_function(name = "trunc") libc_math_function(name = "truncf") @@ -2896,6 +3221,8 @@ libc_math_function(name = "truncl") libc_math_function(name = "truncf128") +libc_math_function(name = "truncf16") + libc_math_function(name = "ufromfp") libc_math_function(name = "ufromfpf") @@ -2904,6 +3231,8 @@ libc_math_function(name = "ufromfpl") libc_math_function(name = "ufromfpf128") +libc_math_function(name = "ufromfpf16") + libc_math_function(name = "ufromfpx") libc_math_function(name = "ufromfpxf") @@ -2912,6 +3241,8 @@ libc_math_function(name = "ufromfpxl") libc_math_function(name = "ufromfpxf128") +libc_math_function(name = "ufromfpxf16") + ############################## inttypes targets ############################## libc_function( diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index f298f817af83d..9dc25f95b8e3f 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -8,7 +8,7 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load("@bazel_skylib//lib:selects.bzl", "selects") load(":libc_configure_options.bzl", "LIBC_CONFIGURE_OPTIONS") load(":libc_namespace.bzl", "LIBC_NAMESPACE") -load(":platforms.bzl", "PLATFORM_CPU_ARM64", "PLATFORM_CPU_X86_64") +load(":platforms.bzl", "PLATFORM_CPU_X86_64") def libc_internal_target(name): return name + ".__internal__" diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 
00254ba6e99bb..6d151dd11fa65 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7839,6 +7839,7 @@ cc_library( "include/mlir/Transforms/LoopInvariantCodeMotionUtils.h", "include/mlir/Transforms/OneToNTypeConversion.h", "include/mlir/Transforms/RegionUtils.h", + "include/mlir/Transforms/WalkPatternRewriteDriver.h", ], includes = ["include"], deps = [